import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,recall_score,f1_score,precision_score
# import tensorflow as tf
# import keras
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
df_attr = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
pd.options.display.max_columns=200
df_attr.head()
| Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
# pd.options.display.max_columns=200
df_attr.shape
(1470, 35)
# Attrition
print(pd.DataFrame(df_attr['Attrition'].value_counts()))
print('----------------------------------------------')
print('Yes :', round((237/(1233+237)*100),2),'%')
print('No :', round((1233/(1233+237)*100),2),'%')
sns.countplot(x=df_attr['Attrition'])
Attrition 0 1233 1 237 ---------------------------------------------- Yes : 16.12 % No : 83.88 %
<AxesSubplot: xlabel='Attrition', ylabel='count'>
# Age
plt.figure(figsize=(20,6))
sns.countplot(x=df_attr['Age'],hue = df_attr['Attrition'])
<AxesSubplot: xlabel='Age', ylabel='count'>
# Business Travel
print(pd.DataFrame(df_attr['BusinessTravel'].value_counts()))
print('----------------------------------------------------')
print('BusinessTravel : ', round(1043/(1043+277+150)*100,2),'%')
print('Travel_Frequently : ', round(277/(1043+277+150)*100,2),'%')
print('Non-Travel : ', round(150/(1043+277+150)*100,2),'%')
sns.countplot(x=df_attr['BusinessTravel'], hue = df_attr['Attrition'])
BusinessTravel Travel_Rarely 1043 Travel_Frequently 277 Non-Travel 150 ---------------------------------------------------- BusinessTravel : 70.95 % Travel_Frequently : 18.84 % Non-Travel : 10.2 %
<AxesSubplot: xlabel='BusinessTravel', ylabel='count'>
# Department
plt.figure(figsize=(8,4))
print(pd.DataFrame(df_attr['Department'].value_counts()))
print('----------------------------------------------')
print(pd.DataFrame(df_attr['Department'].value_counts(normalize=True)))
sns.countplot(x=df_attr['Department'], hue = df_attr['Attrition'])
Department
Research & Development 961
Sales 446
Human Resources 63
----------------------------------------------
Department
Research & Development 0.653741
Sales 0.303401
Human Resources 0.042857
<AxesSubplot: xlabel='Department', ylabel='count'>
# EducationField
plt.figure(figsize=(10,4))
print(pd.DataFrame(df_attr['EducationField'].value_counts()))
print('--------------------------------------------------')
print(pd.DataFrame(df_attr['EducationField'].value_counts(normalize=True)))
sns.countplot(x=df_attr['EducationField'], hue = df_attr['Attrition'])
EducationField
Life Sciences 606
Medical 464
Marketing 159
Technical Degree 132
Other 82
Human Resources 27
--------------------------------------------------
EducationField
Life Sciences 0.412245
Medical 0.315646
Marketing 0.108163
Technical Degree 0.089796
Other 0.055782
Human Resources 0.018367
<AxesSubplot: xlabel='EducationField', ylabel='count'>
# Gender
print(pd.DataFrame(df_attr['Gender'].value_counts()))
print('------------------------------------------')
print(pd.DataFrame(df_attr['Gender'].value_counts(normalize=True)))
sns.countplot(x=df_attr['Gender'], hue = df_attr['Attrition'])
Gender
Male 882
Female 588
------------------------------------------
Gender
Male 0.6
Female 0.4
<AxesSubplot: xlabel='Gender', ylabel='count'>
# MaritalStatus
print(pd.DataFrame(df_attr['MaritalStatus'].value_counts()))
print('------------------------------------------------')
print(pd.DataFrame(df_attr['MaritalStatus'].value_counts(normalize=True)))
sns.countplot(x=df_attr['MaritalStatus'],hue=df_attr['Attrition'])
MaritalStatus
Married 673
Single 470
Divorced 327
------------------------------------------------
MaritalStatus
Married 0.457823
Single 0.319728
Divorced 0.222449
<AxesSubplot: xlabel='MaritalStatus', ylabel='count'>
# Overtime
print(pd.DataFrame(df_attr['OverTime'].value_counts()))
print('-----------------------------------------')
print(pd.DataFrame(df_attr['OverTime'].value_counts(normalize=True)))
sns.countplot(x = df_attr['OverTime'], hue = df_attr['Attrition'])
OverTime
No 1054
Yes 416
-----------------------------------------
OverTime
No 0.717007
Yes 0.282993
<AxesSubplot: xlabel='OverTime', ylabel='count'>
df_attr.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition 1470 non-null int64 1 Age 1470 non-null int64 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null int64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null int64 6 Education 1470 non-null int64 7 EducationField 1470 non-null object 8 EmployeeCount 1470 non-null int64 9 EmployeeNumber 1470 non-null int64 10 EnvironmentSatisfaction 1470 non-null int64 11 Gender 1470 non-null object 12 HourlyRate 1470 non-null int64 13 JobInvolvement 1470 non-null int64 14 JobLevel 1470 non-null int64 15 JobRole 1470 non-null object 16 JobSatisfaction 1470 non-null int64 17 MaritalStatus 1470 non-null object 18 MonthlyIncome 1470 non-null int64 19 MonthlyRate 1470 non-null int64 20 NumCompaniesWorked 1470 non-null int64 21 Over18 1470 non-null object 22 OverTime 1470 non-null object 23 PercentSalaryHike 1470 non-null int64 24 PerformanceRating 1470 non-null int64 25 RelationshipSatisfaction 1470 non-null int64 26 StandardHours 1470 non-null int64 27 StockOptionLevel 1470 non-null int64 28 TotalWorkingYears 1470 non-null int64 29 TrainingTimesLastYear 1470 non-null int64 30 WorkLifeBalance 1470 non-null int64 31 YearsAtCompany 1470 non-null int64 32 YearsInCurrentRole 1470 non-null int64 33 YearsSinceLastPromotion 1470 non-null int64 34 YearsWithCurrManager 1470 non-null int64 dtypes: int64(27), object(8) memory usage: 402.1+ KB
df_attr.isna().sum()
Attrition 0 Age 0 BusinessTravel 0 DailyRate 0 Department 0 DistanceFromHome 0 Education 0 EducationField 0 EmployeeCount 0 EmployeeNumber 0 EnvironmentSatisfaction 0 Gender 0 HourlyRate 0 JobInvolvement 0 JobLevel 0 JobRole 0 JobSatisfaction 0 MaritalStatus 0 MonthlyIncome 0 MonthlyRate 0 NumCompaniesWorked 0 Over18 0 OverTime 0 PercentSalaryHike 0 PerformanceRating 0 RelationshipSatisfaction 0 StandardHours 0 StockOptionLevel 0 TotalWorkingYears 0 TrainingTimesLastYear 0 WorkLifeBalance 0 YearsAtCompany 0 YearsInCurrentRole 0 YearsSinceLastPromotion 0 YearsWithCurrManager 0 dtype: int64
cat = []
con = []
for i in df_attr.columns:
if (df_attr[i].dtypes == "object"):
cat.append(i)
else:
con.append(i)
cat
['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
con
['Attrition', 'Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
df_train_cat = cat
df_train_cat = df_attr[cat]
df_train_cat
| BusinessTravel | Department | EducationField | Gender | JobRole | MaritalStatus | Over18 | OverTime | |
|---|---|---|---|---|---|---|---|---|
| 0 | Travel_Rarely | Sales | Life Sciences | Female | Sales Executive | Single | Y | Yes |
| 1 | Travel_Frequently | Research & Development | Life Sciences | Male | Research Scientist | Married | Y | No |
| 2 | Travel_Rarely | Research & Development | Other | Male | Laboratory Technician | Single | Y | Yes |
| 3 | Travel_Frequently | Research & Development | Life Sciences | Female | Research Scientist | Married | Y | Yes |
| 4 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | Travel_Frequently | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
| 1466 | Travel_Rarely | Research & Development | Medical | Male | Healthcare Representative | Married | Y | No |
| 1467 | Travel_Rarely | Research & Development | Life Sciences | Male | Manufacturing Director | Married | Y | Yes |
| 1468 | Travel_Frequently | Sales | Medical | Male | Sales Executive | Married | Y | No |
| 1469 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
1470 rows × 8 columns
# Visulization of categorical columns
plt.figure(figsize=(15,19))
for x1, i in enumerate(df_train_cat.columns):
if df_train_cat[i].dtypes=='object':
plt.subplot(3,3,x1+1)
sns.countplot(x=df_train_cat[i])
df_train_con = con
df_train_con = df_attr[con]
df_train_con
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | 1102 | 1 | 2 | 1 | 1 | 2 | 94 | 3 | 2 | 4 | 5993 | 19479 | 8 | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | 279 | 8 | 1 | 1 | 2 | 3 | 61 | 2 | 2 | 2 | 5130 | 24907 | 1 | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | 1373 | 2 | 2 | 1 | 4 | 4 | 92 | 2 | 1 | 3 | 2090 | 2396 | 6 | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | 1392 | 3 | 4 | 1 | 5 | 4 | 56 | 3 | 1 | 3 | 2909 | 23159 | 1 | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | 591 | 2 | 1 | 1 | 7 | 1 | 40 | 3 | 1 | 2 | 3468 | 16632 | 9 | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 36 | 884 | 23 | 2 | 1 | 2061 | 3 | 41 | 4 | 2 | 4 | 2571 | 12290 | 4 | 17 | 3 | 3 | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 0 | 39 | 613 | 6 | 1 | 1 | 2062 | 4 | 42 | 2 | 3 | 1 | 9991 | 21457 | 4 | 15 | 3 | 1 | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 0 | 27 | 155 | 4 | 3 | 1 | 2064 | 2 | 87 | 4 | 2 | 2 | 6142 | 5174 | 1 | 20 | 4 | 2 | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 0 | 49 | 1023 | 2 | 3 | 1 | 2065 | 4 | 63 | 2 | 2 | 2 | 5390 | 13243 | 2 | 14 | 3 | 4 | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 0 | 34 | 628 | 8 | 3 | 1 | 2068 | 2 | 82 | 4 | 2 | 3 | 4404 | 10228 | 2 | 12 | 3 | 1 | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 27 columns
# Visualisation of continious columns
plt.figure(figsize=(15,19))
for x1, i in enumerate(df_train_con.columns):
if df_train_con[i].dtypes=='int64' or df_train_con[i].dtypes=='float64':
plt.subplot(9,3,x1+1)
sns.boxplot(df_train_con[i])
for i in df_train_con.columns:
q1 = df_train_con[i].quantile(0.25)
q3 = df_train_con[i].quantile(0.75)
IQR = q3-q1
uppertail = q3+1.5*IQR
lowertail = q1-1.5*IQR
df_train_con.loc[(df_train_con[i]>uppertail) | (df_train_con[i]<lowertail)]
mean_1 = df_train_con[i].mean()
df_train_con.loc[(df_train_con[i]>uppertail) | (df_train_con[i]<lowertail),i]=mean_1
plt.figure(figsize=(15,19))
for x1, i in enumerate(df_train_con.columns):
if df_train_con[i].dtypes=='int64' or df_train_con[i].dtypes=='float64':
plt.subplot(9,3,x1+1)
sns.boxplot(df_train_con[i])
df_attr.skew().sort_values()
WorkLifeBalance -0.552480 JobInvolvement -0.498419 JobSatisfaction -0.329672 EnvironmentSatisfaction -0.321654 RelationshipSatisfaction -0.302828 Education -0.289681 HourlyRate -0.032311 DailyRate -0.003519 EmployeeCount 0.000000 StandardHours 0.000000 EmployeeNumber 0.016574 MonthlyRate 0.018578 Age 0.413286 TrainingTimesLastYear 0.553124 PercentSalaryHike 0.821128 YearsWithCurrManager 0.833451 YearsInCurrentRole 0.917363 DistanceFromHome 0.958118 StockOptionLevel 0.968980 JobLevel 1.025401 NumCompaniesWorked 1.026471 TotalWorkingYears 1.117172 MonthlyIncome 1.369817 YearsAtCompany 1.764529 Attrition 1.844366 PerformanceRating 1.921883 YearsSinceLastPromotion 1.984290 dtype: float64
plt.figure(figsize=(20,20))
sns.heatmap(df_attr.corr(), annot=True)
<AxesSubplot: >
# Create new/derived predictors (e.g Age group) for analysis
df_attr.head()
| Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
df_attr['Age'].unique()
array([41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29, 31, 34, 28, 22, 53,
24, 21, 42, 44, 46, 39, 43, 50, 26, 48, 55, 45, 56, 23, 51, 40, 54,
58, 20, 25, 19, 57, 52, 47, 18, 60], dtype=int64)
df_attr['Age'].value_counts()
35 78 34 77 31 69 36 69 29 68 32 61 30 60 33 58 38 58 40 57 37 50 27 48 28 48 42 46 39 42 45 41 41 40 26 39 46 33 44 33 43 32 50 30 24 26 25 26 47 24 49 24 55 22 48 19 51 19 53 19 52 18 54 18 22 16 56 14 58 14 23 14 21 13 20 11 59 10 19 9 18 8 60 5 57 4 Name: Age, dtype: int64
df_attr['Age'].max()
60
T = []
for i in df_attr['Age']:
if(i>15 and i<=30):
T.append('GROUP1')
elif(i>30 and i<=45):
T.append('GROUP2')
else:
T.append('GROUP3')
df_attr['AGE_GROUP']=T
df_attr.head()
| Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | AGE_GROUP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 | GROUP2 |
| 1 | 0 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 | GROUP3 |
| 2 | 1 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 | GROUP2 |
| 3 | 0 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 | GROUP2 |
| 4 | 0 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 | GROUP1 |
# 3. Explore the Data using Exploratory Data Analysis - for Y and all Xs
df_attr3 = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
df_attr3.head()
| Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
# Seprate Categorical and continious columns
cat = []
con = []
for i in df_attr3.columns:
if (df_attr3[i].dtypes == 'object'):
cat.append(i)
else:
con.append(i)
df_attr3_cat = cat
df_attr3_cat = df_attr3[cat]
df_attr3_cat
| BusinessTravel | Department | EducationField | Gender | JobRole | MaritalStatus | Over18 | OverTime | |
|---|---|---|---|---|---|---|---|---|
| 0 | Travel_Rarely | Sales | Life Sciences | Female | Sales Executive | Single | Y | Yes |
| 1 | Travel_Frequently | Research & Development | Life Sciences | Male | Research Scientist | Married | Y | No |
| 2 | Travel_Rarely | Research & Development | Other | Male | Laboratory Technician | Single | Y | Yes |
| 3 | Travel_Frequently | Research & Development | Life Sciences | Female | Research Scientist | Married | Y | Yes |
| 4 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | Travel_Frequently | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
| 1466 | Travel_Rarely | Research & Development | Medical | Male | Healthcare Representative | Married | Y | No |
| 1467 | Travel_Rarely | Research & Development | Life Sciences | Male | Manufacturing Director | Married | Y | Yes |
| 1468 | Travel_Frequently | Sales | Medical | Male | Sales Executive | Married | Y | No |
| 1469 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
1470 rows × 8 columns
df_attr3_con = con
df_attr3_con = df_attr3[con]
df_attr3_con
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | 1102 | 1 | 2 | 1 | 1 | 2 | 94 | 3 | 2 | 4 | 5993 | 19479 | 8 | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | 279 | 8 | 1 | 1 | 2 | 3 | 61 | 2 | 2 | 2 | 5130 | 24907 | 1 | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | 1373 | 2 | 2 | 1 | 4 | 4 | 92 | 2 | 1 | 3 | 2090 | 2396 | 6 | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | 1392 | 3 | 4 | 1 | 5 | 4 | 56 | 3 | 1 | 3 | 2909 | 23159 | 1 | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | 591 | 2 | 1 | 1 | 7 | 1 | 40 | 3 | 1 | 2 | 3468 | 16632 | 9 | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 36 | 884 | 23 | 2 | 1 | 2061 | 3 | 41 | 4 | 2 | 4 | 2571 | 12290 | 4 | 17 | 3 | 3 | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 0 | 39 | 613 | 6 | 1 | 1 | 2062 | 4 | 42 | 2 | 3 | 1 | 9991 | 21457 | 4 | 15 | 3 | 1 | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 0 | 27 | 155 | 4 | 3 | 1 | 2064 | 2 | 87 | 4 | 2 | 2 | 6142 | 5174 | 1 | 20 | 4 | 2 | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 0 | 49 | 1023 | 2 | 3 | 1 | 2065 | 4 | 63 | 2 | 2 | 2 | 5390 | 13243 | 2 | 14 | 3 | 4 | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 0 | 34 | 628 | 8 | 3 | 1 | 2068 | 2 | 82 | 4 | 2 | 3 | 4404 | 10228 | 2 | 12 | 3 | 1 | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 27 columns
# Visualisation of categorical columns
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_attr3_cat.columns):
if df_attr3_cat[i].dtypes == 'object':
plt.subplot(3,3,x1+1)
sns.countplot(x=df_attr3_cat[i])
# Visulisation of continious columns
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_attr3_con.columns):
if df_attr3_con[i].dtypes == 'int64' or df_attr3_con[i].dtypes == 'float64':
plt.subplot(9,3,x1+1)
sns.boxplot(df_attr3_con[i])
df_attr3.describe()
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
| mean | 0.161224 | 36.923810 | 802.485714 | 9.192517 | 2.912925 | 1.0 | 1024.865306 | 2.721769 | 65.891156 | 2.729932 | 2.063946 | 2.728571 | 6502.931293 | 14313.103401 | 2.693197 | 15.209524 | 3.153741 | 2.712245 | 80.0 | 0.793878 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
| std | 0.367863 | 9.135373 | 403.509100 | 8.106864 | 1.024165 | 0.0 | 602.024335 | 1.093082 | 20.329428 | 0.711561 | 1.106940 | 1.102846 | 4707.956783 | 7117.786044 | 2.498009 | 3.659938 | 0.360824 | 1.081209 | 0.0 | 0.852077 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
| min | 0.000000 | 18.000000 | 102.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | 30.000000 | 1.000000 | 1.000000 | 1.000000 | 1009.000000 | 2094.000000 | 0.000000 | 11.000000 | 3.000000 | 1.000000 | 80.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 30.000000 | 465.000000 | 2.000000 | 2.000000 | 1.0 | 491.250000 | 2.000000 | 48.000000 | 2.000000 | 1.000000 | 2.000000 | 2911.000000 | 8047.000000 | 1.000000 | 12.000000 | 3.000000 | 2.000000 | 80.0 | 0.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
| 50% | 0.000000 | 36.000000 | 802.000000 | 7.000000 | 3.000000 | 1.0 | 1020.500000 | 3.000000 | 66.000000 | 3.000000 | 2.000000 | 3.000000 | 4919.000000 | 14235.500000 | 2.000000 | 14.000000 | 3.000000 | 3.000000 | 80.0 | 1.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| 75% | 0.000000 | 43.000000 | 1157.000000 | 14.000000 | 4.000000 | 1.0 | 1555.750000 | 4.000000 | 83.750000 | 3.000000 | 3.000000 | 4.000000 | 8379.000000 | 20461.500000 | 4.000000 | 18.000000 | 3.000000 | 4.000000 | 80.0 | 1.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
| max | 1.000000 | 60.000000 | 1499.000000 | 29.000000 | 5.000000 | 1.0 | 2068.000000 | 4.000000 | 100.000000 | 4.000000 | 5.000000 | 4.000000 | 19999.000000 | 26999.000000 | 9.000000 | 25.000000 | 4.000000 | 4.000000 | 80.0 | 3.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
for i in df_attr3_con.columns:
q1 = df_attr3_con[i].quantile(0.25)
q3 = df_attr3_con[i].quantile(0.75)
IQR = q3-q1
uppertail = q3+1.5*IQR
lowertail = q1-1.5*IQR
df_attr3_con.loc[(df_attr3_con[i]>uppertail) | (df_attr3_con[i]<lowertail)]
mean_1 = df_attr3_con[i].mean()
df_attr3_con.loc[(df_attr3_con[i]>uppertail) | (df_attr3_con[i]<lowertail),i]=mean_1
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_attr3_con.columns):
if df_attr3_con[i].dtypes == 'int64' or df_attr3_con[i].dtypes == 'float64':
plt.subplot(9,3,x1+1)
sns.boxplot(df_attr3_con[i])
df_attr3.skew().sort_values()
WorkLifeBalance -0.552480 JobInvolvement -0.498419 JobSatisfaction -0.329672 EnvironmentSatisfaction -0.321654 RelationshipSatisfaction -0.302828 Education -0.289681 HourlyRate -0.032311 DailyRate -0.003519 EmployeeCount 0.000000 StandardHours 0.000000 EmployeeNumber 0.016574 MonthlyRate 0.018578 Age 0.413286 TrainingTimesLastYear 0.553124 PercentSalaryHike 0.821128 YearsWithCurrManager 0.833451 YearsInCurrentRole 0.917363 DistanceFromHome 0.958118 StockOptionLevel 0.968980 JobLevel 1.025401 NumCompaniesWorked 1.026471 TotalWorkingYears 1.117172 MonthlyIncome 1.369817 YearsAtCompany 1.764529 Attrition 1.844366 PerformanceRating 1.921883 YearsSinceLastPromotion 1.984290 dtype: float64
df_attr3.corr()
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition | 1.000000 | -0.159205 | -0.056652 | 0.077924 | -0.031373 | NaN | -0.010577 | -0.103369 | -0.006846 | -0.130016 | -0.169105 | -0.103481 | -0.159840 | 0.015170 | 0.043494 | -0.013478 | 0.002889 | -0.045872 | NaN | -0.137145 | -0.171063 | -0.059478 | -0.063939 | -0.134392 | -0.160545 | -0.033019 | -0.156199 |
| Age | -0.159205 | 1.000000 | 0.010661 | -0.001686 | 0.208034 | NaN | -0.010145 | 0.010146 | 0.024287 | 0.029820 | 0.509604 | -0.004892 | 0.497855 | 0.028051 | 0.299635 | 0.003634 | 0.001904 | 0.053535 | NaN | 0.037510 | 0.680381 | -0.019621 | -0.021490 | 0.311309 | 0.212901 | 0.216513 | 0.202089 |
| DailyRate | -0.056652 | 0.010661 | 1.000000 | -0.004985 | -0.016806 | NaN | -0.050990 | 0.018355 | 0.023381 | 0.046135 | 0.002966 | 0.030571 | 0.007707 | -0.032182 | 0.038153 | 0.022704 | 0.000473 | 0.007846 | NaN | 0.042143 | 0.014515 | 0.002453 | -0.037848 | -0.034055 | 0.009932 | -0.033229 | -0.026363 |
| DistanceFromHome | 0.077924 | -0.001686 | -0.004985 | 1.000000 | 0.021042 | NaN | 0.032916 | -0.016075 | 0.031131 | 0.008783 | 0.005303 | -0.003669 | -0.017014 | 0.027473 | -0.029251 | 0.040235 | 0.027110 | 0.006557 | NaN | 0.044872 | 0.004628 | -0.036942 | -0.026556 | 0.009508 | 0.018845 | 0.010029 | 0.014406 |
| Education | -0.031373 | 0.208034 | -0.016806 | 0.021042 | 1.000000 | NaN | 0.042070 | -0.027128 | 0.016775 | 0.042438 | 0.101589 | -0.011296 | 0.094961 | -0.026084 | 0.126317 | -0.011111 | -0.024539 | -0.009118 | NaN | 0.018422 | 0.148280 | -0.025100 | 0.009819 | 0.069114 | 0.060236 | 0.054254 | 0.069065 |
| EmployeeCount | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| EmployeeNumber | -0.010577 | -0.010145 | -0.050990 | 0.032916 | 0.042070 | NaN | 1.000000 | 0.017621 | 0.035179 | -0.006888 | -0.018519 | -0.046247 | -0.014829 | 0.012648 | -0.001251 | -0.012944 | -0.020359 | -0.069861 | NaN | 0.062227 | -0.014365 | 0.023603 | 0.010309 | -0.011240 | -0.008416 | -0.009019 | -0.009197 |
| EnvironmentSatisfaction | -0.103369 | 0.010146 | 0.018355 | -0.016075 | -0.027128 | NaN | 0.017621 | 1.000000 | -0.049857 | -0.008278 | 0.001212 | -0.006784 | -0.006259 | 0.037600 | 0.012594 | -0.031701 | -0.029548 | 0.007665 | NaN | 0.003432 | -0.002693 | -0.019359 | 0.027627 | 0.001458 | 0.018007 | 0.016194 | -0.004999 |
| HourlyRate | -0.006846 | 0.024287 | 0.023381 | 0.031131 | 0.016775 | NaN | 0.035179 | -0.049857 | 1.000000 | 0.042861 | -0.027853 | -0.071335 | -0.015794 | -0.015297 | 0.022157 | -0.009062 | -0.002172 | 0.001330 | NaN | 0.050263 | -0.002334 | -0.008548 | -0.004607 | -0.019582 | -0.024106 | -0.026716 | -0.020123 |
| JobInvolvement | -0.130016 | 0.029820 | 0.046135 | 0.008783 | 0.042438 | NaN | -0.006888 | -0.008278 | 0.042861 | 1.000000 | -0.012630 | -0.021476 | -0.015271 | -0.016322 | 0.015012 | -0.017205 | -0.029071 | 0.034297 | NaN | 0.021523 | -0.005533 | -0.015338 | -0.014617 | -0.021355 | 0.008717 | -0.024184 | 0.025976 |
| JobLevel | -0.169105 | 0.509604 | 0.002966 | 0.005303 | 0.101589 | NaN | -0.018519 | 0.001212 | -0.027853 | -0.012630 | 1.000000 | -0.001944 | 0.950300 | 0.039563 | 0.142501 | -0.034730 | -0.021222 | 0.021642 | NaN | 0.013984 | 0.782208 | -0.018191 | 0.037818 | 0.534739 | 0.389447 | 0.353885 | 0.375281 |
| JobSatisfaction | -0.103481 | -0.004892 | 0.030571 | -0.003669 | -0.011296 | NaN | -0.046247 | -0.006784 | -0.071335 | -0.021476 | -0.001944 | 1.000000 | -0.007157 | 0.000644 | -0.055699 | 0.020002 | 0.002297 | -0.012454 | NaN | 0.010690 | -0.020185 | -0.005779 | -0.019459 | -0.003803 | -0.002305 | -0.018214 | -0.027656 |
| MonthlyIncome | -0.159840 | 0.497855 | 0.007707 | -0.017014 | 0.094961 | NaN | -0.014829 | -0.006259 | -0.015794 | -0.015271 | 0.950300 | -0.007157 | 1.000000 | 0.034814 | 0.149515 | -0.027269 | -0.017120 | 0.025873 | NaN | 0.005408 | 0.772893 | -0.021736 | 0.030683 | 0.514285 | 0.363818 | 0.344978 | 0.344079 |
| MonthlyRate | 0.015170 | 0.028051 | -0.032182 | 0.027473 | -0.026084 | NaN | 0.012648 | 0.037600 | -0.015297 | -0.016322 | 0.039563 | 0.000644 | 0.034814 | 1.000000 | 0.017521 | -0.006429 | -0.009811 | -0.004085 | NaN | -0.034323 | 0.026442 | 0.001467 | 0.007963 | -0.023655 | -0.012815 | 0.001567 | -0.036746 |
| NumCompaniesWorked | 0.043494 | 0.299635 | 0.038153 | -0.029251 | 0.126317 | NaN | -0.001251 | 0.012594 | 0.022157 | 0.015012 | 0.142501 | -0.055699 | 0.149515 | 0.017521 | 1.000000 | -0.010238 | -0.014095 | 0.052733 | NaN | 0.030075 | 0.237639 | -0.066054 | -0.008366 | -0.118421 | -0.090754 | -0.036814 | -0.110319 |
| PercentSalaryHike | -0.013478 | 0.003634 | 0.022704 | 0.040235 | -0.011111 | NaN | -0.012944 | -0.031701 | -0.009062 | -0.017205 | -0.034730 | 0.020002 | -0.027269 | -0.006429 | -0.010238 | 1.000000 | 0.773550 | -0.040490 | NaN | 0.007528 | -0.020608 | -0.005221 | -0.003280 | -0.035991 | -0.001520 | -0.022154 | -0.011985 |
| PerformanceRating | 0.002889 | 0.001904 | 0.000473 | 0.027110 | -0.024539 | NaN | -0.020359 | -0.029548 | -0.002172 | -0.029071 | -0.021222 | 0.002297 | -0.017120 | -0.009811 | -0.014095 | 0.773550 | 1.000000 | -0.031351 | NaN | 0.003506 | 0.006744 | -0.015579 | 0.002572 | 0.003435 | 0.034986 | 0.017896 | 0.022827 |
| RelationshipSatisfaction | -0.045872 | 0.053535 | 0.007846 | 0.006557 | -0.009118 | NaN | -0.069861 | 0.007665 | 0.001330 | 0.034297 | 0.021642 | -0.012454 | 0.025873 | -0.004085 | 0.052733 | -0.040490 | -0.031351 | 1.000000 | NaN | -0.045952 | 0.024054 | 0.002497 | 0.019604 | 0.019367 | -0.015123 | 0.033493 | -0.000867 |
| StandardHours | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| StockOptionLevel | -0.137145 | 0.037510 | 0.042143 | 0.044872 | 0.018422 | NaN | 0.062227 | 0.003432 | 0.050263 | 0.021523 | 0.013984 | 0.010690 | 0.005408 | -0.034323 | 0.030075 | 0.007528 | 0.003506 | -0.045952 | NaN | 1.000000 | 0.010136 | 0.011274 | 0.004129 | 0.015058 | 0.050818 | 0.014352 | 0.024698 |
| TotalWorkingYears | -0.171063 | 0.680381 | 0.014515 | 0.004628 | 0.148280 | NaN | -0.014365 | -0.002693 | -0.002334 | -0.005533 | 0.782208 | -0.020185 | 0.772893 | 0.026442 | 0.237639 | -0.020608 | 0.006744 | 0.024054 | NaN | 0.010136 | 1.000000 | -0.035662 | 0.001008 | 0.628133 | 0.460365 | 0.404858 | 0.459188 |
| TrainingTimesLastYear | -0.059478 | -0.019621 | 0.002453 | -0.036942 | -0.025100 | NaN | 0.023603 | -0.019359 | -0.008548 | -0.015338 | -0.018191 | -0.005779 | -0.021736 | 0.001467 | -0.066054 | -0.005221 | -0.015579 | 0.002497 | NaN | 0.011274 | -0.035662 | 1.000000 | 0.028072 | 0.003569 | -0.005738 | -0.002067 | -0.004096 |
| WorkLifeBalance | -0.063939 | -0.021490 | -0.037848 | -0.026556 | 0.009819 | NaN | 0.010309 | 0.027627 | -0.004607 | -0.014617 | 0.037818 | -0.019459 | 0.030683 | 0.007963 | -0.008366 | -0.003280 | 0.002572 | 0.019604 | NaN | 0.004129 | 0.001008 | 0.028072 | 1.000000 | 0.012089 | 0.049856 | 0.008941 | 0.002759 |
| YearsAtCompany | -0.134392 | 0.311309 | -0.034055 | 0.009508 | 0.069114 | NaN | -0.011240 | 0.001458 | -0.019582 | -0.021355 | 0.534739 | -0.003803 | 0.514285 | -0.023655 | -0.118421 | -0.035991 | 0.003435 | 0.019367 | NaN | 0.015058 | 0.628133 | 0.003569 | 0.012089 | 1.000000 | 0.758754 | 0.618409 | 0.769212 |
| YearsInCurrentRole | -0.160545 | 0.212901 | 0.009932 | 0.018845 | 0.060236 | NaN | -0.008416 | 0.018007 | -0.024106 | 0.008717 | 0.389447 | -0.002305 | 0.363818 | -0.012815 | -0.090754 | -0.001520 | 0.034986 | -0.015123 | NaN | 0.050818 | 0.460365 | -0.005738 | 0.049856 | 0.758754 | 1.000000 | 0.548056 | 0.714365 |
| YearsSinceLastPromotion | -0.033019 | 0.216513 | -0.033229 | 0.010029 | 0.054254 | NaN | -0.009019 | 0.016194 | -0.026716 | -0.024184 | 0.353885 | -0.018214 | 0.344978 | 0.001567 | -0.036814 | -0.022154 | 0.017896 | 0.033493 | NaN | 0.014352 | 0.404858 | -0.002067 | 0.008941 | 0.618409 | 0.548056 | 1.000000 | 0.510224 |
| YearsWithCurrManager | -0.156199 | 0.202089 | -0.026363 | 0.014406 | 0.069065 | NaN | -0.009197 | -0.004999 | -0.020123 | 0.025976 | 0.375281 | -0.027656 | 0.344079 | -0.036746 | -0.110319 | -0.011985 | 0.022827 | -0.000867 | NaN | 0.024698 | 0.459188 | -0.004096 | 0.002759 | 0.769212 | 0.714365 | 0.510224 | 1.000000 |
df_dum_train3 = pd.get_dummies(df_attr3_cat)
df_dum_train3.head()
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | Over18_Y | OverTime_No | OverTime_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
| 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 2 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
| 3 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 4 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
std_scaler = StandardScaler()
std_scaler1 = std_scaler.fit_transform(df_attr3_con)
x3 = pd.DataFrame(std_scaler1,columns=df_attr3_con.columns)
x3.head()
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.280906 | 0.446350 | 0.742527 | -1.010909 | -0.891688 | 0.0 | -1.701283 | -0.660531 | 1.383138 | 0.379672 | -0.057788 | 1.153254 | 0.129018 | 0.726020 | 2.529583 | -1.150554 | -0.426230 | -1.584178 | 0.0 | -1.018674 | -0.374906 | 0.312607 | -2.493820 | 0.041137 | -0.018341 | -0.795491 | 0.294570 |
| 1 | -0.438422 | 1.322365 | -1.297775 | -0.147150 | -1.868426 | 0.0 | -1.699621 | 0.254625 | -0.240677 | -1.026167 | -0.057788 | -0.660853 | -0.140791 | 1.488876 | -0.672478 | 2.129306 | 2.346151 | 1.191438 | 0.0 | 0.510149 | -0.057867 | 0.601911 | 0.338096 | 1.069787 | 0.882230 | -0.277632 | 0.888852 |
| 2 | 2.280906 | 0.008343 | 1.414363 | -0.887515 | -0.891688 | 0.0 | -1.696298 | 1.169781 | 1.284725 | -1.026167 | -0.961486 | 0.246200 | -1.091220 | -1.674841 | 1.614708 | -0.057267 | -0.426230 | -0.658973 | 0.0 | -1.018674 | -0.533426 | 0.601911 | 0.338096 | -1.501837 | -1.219103 | -0.795491 | -1.191138 |
| 3 | -0.438422 | -0.429664 | 1.461466 | -0.764121 | 1.061787 | 0.0 | -1.694636 | 1.169781 | -0.486709 | 0.379672 | -0.961486 | 0.246200 | -0.835167 | 1.243211 | -0.672478 | -1.150554 | -0.426230 | 0.266233 | 0.0 | -1.018674 | -0.374906 | 0.601911 | 0.338096 | 0.555462 | 0.882230 | 0.758085 | -1.191138 |
| 4 | -0.438422 | -1.086676 | -0.524295 | -0.887515 | -1.868426 | 0.0 | -1.691313 | -1.575686 | -1.274014 | 0.379672 | -0.961486 | -0.660853 | -0.660400 | 0.325900 | 0.102053 | -0.877232 | -0.426230 | 1.191438 | 0.0 | 0.510149 | -0.691946 | 0.601911 | 0.338096 | -0.987512 | -0.618722 | 0.240227 | -0.596855 |
train_merge3 = pd.concat([df_dum_train3,x3],axis=1)
train_merge3.head()
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | Over18_Y | OverTime_No | OverTime_Yes | Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 2.280906 | 0.446350 | 0.742527 | -1.010909 | -0.891688 | 0.0 | -1.701283 | -0.660531 | 1.383138 | 0.379672 | -0.057788 | 1.153254 | 0.129018 | 0.726020 | 2.529583 | -1.150554 | -0.426230 | -1.584178 | 0.0 | -1.018674 | -0.374906 | 0.312607 | -2.493820 | 0.041137 | -0.018341 | -0.795491 | 0.294570 |
| 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | -0.438422 | 1.322365 | -1.297775 | -0.147150 | -1.868426 | 0.0 | -1.699621 | 0.254625 | -0.240677 | -1.026167 | -0.057788 | -0.660853 | -0.140791 | 1.488876 | -0.672478 | 2.129306 | 2.346151 | 1.191438 | 0.0 | 0.510149 | -0.057867 | 0.601911 | 0.338096 | 1.069787 | 0.882230 | -0.277632 | 0.888852 |
| 2 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 2.280906 | 0.008343 | 1.414363 | -0.887515 | -0.891688 | 0.0 | -1.696298 | 1.169781 | 1.284725 | -1.026167 | -0.961486 | 0.246200 | -1.091220 | -1.674841 | 1.614708 | -0.057267 | -0.426230 | -0.658973 | 0.0 | -1.018674 | -0.533426 | 0.601911 | 0.338096 | -1.501837 | -1.219103 | -0.795491 | -1.191138 |
| 3 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | -0.438422 | -0.429664 | 1.461466 | -0.764121 | 1.061787 | 0.0 | -1.694636 | 1.169781 | -0.486709 | 0.379672 | -0.961486 | 0.246200 | -0.835167 | 1.243211 | -0.672478 | -1.150554 | -0.426230 | 0.266233 | 0.0 | -1.018674 | -0.374906 | 0.601911 | 0.338096 | 0.555462 | 0.882230 | 0.758085 | -1.191138 |
| 4 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | -0.438422 | -1.086676 | -0.524295 | -0.887515 | -1.868426 | 0.0 | -1.691313 | -1.575686 | -1.274014 | 0.379672 | -0.961486 | -0.660853 | -0.660400 | 0.325900 | 0.102053 | -0.877232 | -0.426230 | 1.191438 | 0.0 | 0.510149 | -0.691946 | 0.601911 | 0.338096 | -0.987512 | -0.618722 | 0.240227 | -0.596855 |
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif3 = pd.DataFrame()
vif3['Features']=train_merge3.columns
vif3['VIF'] = [variance_inflation_factor(train_merge3.values,i) for i in range(train_merge3.shape[1])]
vif3
| Features | VIF | |
|---|---|---|
| 0 | BusinessTravel_Non-Travel | inf |
| 1 | BusinessTravel_Travel_Frequently | inf |
| 2 | BusinessTravel_Travel_Rarely | inf |
| 3 | Department_Human Resources | inf |
| 4 | Department_Research & Development | inf |
| 5 | Department_Sales | inf |
| 6 | EducationField_Human Resources | inf |
| 7 | EducationField_Life Sciences | inf |
| 8 | EducationField_Marketing | inf |
| 9 | EducationField_Medical | inf |
| 10 | EducationField_Other | inf |
| 11 | EducationField_Technical Degree | inf |
| 12 | Gender_Female | inf |
| 13 | Gender_Male | inf |
| 14 | JobRole_Healthcare Representative | inf |
| 15 | JobRole_Human Resources | inf |
| 16 | JobRole_Laboratory Technician | inf |
| 17 | JobRole_Manager | inf |
| 18 | JobRole_Manufacturing Director | inf |
| 19 | JobRole_Research Director | inf |
| 20 | JobRole_Research Scientist | inf |
| 21 | JobRole_Sales Executive | inf |
| 22 | JobRole_Sales Representative | inf |
| 23 | MaritalStatus_Divorced | inf |
| 24 | MaritalStatus_Married | inf |
| 25 | MaritalStatus_Single | inf |
| 26 | Over18_Y | 0.000000 |
| 27 | OverTime_No | inf |
| 28 | OverTime_Yes | inf |
| 29 | Attrition | 1.347585 |
| 30 | Age | 1.758788 |
| 31 | DailyRate | 1.038182 |
| 32 | DistanceFromHome | 1.038291 |
| 33 | Education | 1.087140 |
| 34 | EmployeeCount | NaN |
| 35 | EmployeeNumber | 1.035390 |
| 36 | EnvironmentSatisfaction | 1.060889 |
| 37 | HourlyRate | 1.032229 |
| 38 | JobInvolvement | 1.042954 |
| 39 | JobLevel | 6.346295 |
| 40 | JobSatisfaction | 1.046714 |
| 41 | MonthlyIncome | 2.667871 |
| 42 | MonthlyRate | 1.025837 |
| 43 | NumCompaniesWorked | 1.282556 |
| 44 | PercentSalaryHike | 2.572648 |
| 45 | PerformanceRating | 2.550537 |
| 46 | RelationshipSatisfaction | 1.037069 |
| 47 | StandardHours | NaN |
| 48 | StockOptionLevel | 2.185174 |
| 49 | TotalWorkingYears | 2.756983 |
| 50 | TrainingTimesLastYear | 1.030164 |
| 51 | WorkLifeBalance | 1.032552 |
| 52 | YearsAtCompany | 4.548332 |
| 53 | YearsInCurrentRole | 3.380085 |
| 54 | YearsSinceLastPromotion | 1.329397 |
| 55 | YearsWithCurrManager | 2.937798 |
featurestodrop = vif3.loc[vif3['VIF']>10]
droplist = featurestodrop['Features']
droplist = list(droplist)
len(droplist)
print(droplist)
['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely', 'Department_Human Resources', 'Department_Research & Development', 'Department_Sales', 'EducationField_Human Resources', 'EducationField_Life Sciences', 'EducationField_Marketing', 'EducationField_Medical', 'EducationField_Other', 'EducationField_Technical Degree', 'Gender_Female', 'Gender_Male', 'JobRole_Healthcare Representative', 'JobRole_Human Resources', 'JobRole_Laboratory Technician', 'JobRole_Manager', 'JobRole_Manufacturing Director', 'JobRole_Research Director', 'JobRole_Research Scientist', 'JobRole_Sales Executive', 'JobRole_Sales Representative', 'MaritalStatus_Divorced', 'MaritalStatus_Married', 'MaritalStatus_Single', 'OverTime_No', 'OverTime_Yes']
drop_list3 = ['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
'Department_Human Resources', 'Department_Research & Development', 'Department_Sales',
'EducationField_Human Resources', 'EducationField_Life Sciences', 'EducationField_Marketing',
'EducationField_Medical', 'EducationField_Other', 'EducationField_Technical Degree', 'Gender_Female',
'Gender_Male', 'JobRole_Healthcare Representative', 'JobRole_Human Resources',
'JobRole_Laboratory Technician', 'JobRole_Manager','JobRole_Manufacturing Director',
'JobRole_Research Director','JobRole_Research Scientist', 'JobRole_Sales Executive',
'JobRole_Sales Representative', 'MaritalStatus_Divorced', 'MaritalStatus_Married', 'MaritalStatus_Single',
'OverTime_No', 'OverTime_Yes']
df_final3 = train_merge3.drop(drop_list3,axis=1)
df_final3.head()
| Over18_Y | Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2.280906 | 0.446350 | 0.742527 | -1.010909 | -0.891688 | 0.0 | -1.701283 | -0.660531 | 1.383138 | 0.379672 | -0.057788 | 1.153254 | 0.129018 | 0.726020 | 2.529583 | -1.150554 | -0.426230 | -1.584178 | 0.0 | -1.018674 | -0.374906 | 0.312607 | -2.493820 | 0.041137 | -0.018341 | -0.795491 | 0.294570 |
| 1 | 1 | -0.438422 | 1.322365 | -1.297775 | -0.147150 | -1.868426 | 0.0 | -1.699621 | 0.254625 | -0.240677 | -1.026167 | -0.057788 | -0.660853 | -0.140791 | 1.488876 | -0.672478 | 2.129306 | 2.346151 | 1.191438 | 0.0 | 0.510149 | -0.057867 | 0.601911 | 0.338096 | 1.069787 | 0.882230 | -0.277632 | 0.888852 |
| 2 | 1 | 2.280906 | 0.008343 | 1.414363 | -0.887515 | -0.891688 | 0.0 | -1.696298 | 1.169781 | 1.284725 | -1.026167 | -0.961486 | 0.246200 | -1.091220 | -1.674841 | 1.614708 | -0.057267 | -0.426230 | -0.658973 | 0.0 | -1.018674 | -0.533426 | 0.601911 | 0.338096 | -1.501837 | -1.219103 | -0.795491 | -1.191138 |
| 3 | 1 | -0.438422 | -0.429664 | 1.461466 | -0.764121 | 1.061787 | 0.0 | -1.694636 | 1.169781 | -0.486709 | 0.379672 | -0.961486 | 0.246200 | -0.835167 | 1.243211 | -0.672478 | -1.150554 | -0.426230 | 0.266233 | 0.0 | -1.018674 | -0.374906 | 0.601911 | 0.338096 | 0.555462 | 0.882230 | 0.758085 | -1.191138 |
| 4 | 1 | -0.438422 | -1.086676 | -0.524295 | -0.887515 | -1.868426 | 0.0 | -1.691313 | -1.575686 | -1.274014 | 0.379672 | -0.961486 | -0.660853 | -0.660400 | 0.325900 | 0.102053 | -0.877232 | -0.426230 | 1.191438 | 0.0 | 0.510149 | -0.691946 | 0.601911 | 0.338096 | -0.987512 | -0.618722 | 0.240227 | -0.596855 |
train_merge3.skew().sort_values()
OverTime_No -0.964489 BusinessTravel_Travel_Rarely -0.923992 Department_Research & Development -0.646936 WorkLifeBalance -0.552480 JobInvolvement -0.498419 Gender_Male -0.408665 JobSatisfaction -0.329672 EnvironmentSatisfaction -0.321654 RelationshipSatisfaction -0.302828 Education -0.289681 HourlyRate -0.032311 DailyRate -0.003519 TrainingTimesLastYear -0.001942 StandardHours 0.000000 Over18_Y 0.000000 EmployeeCount 0.000000 EmployeeNumber 0.016574 MonthlyRate 0.018578 MaritalStatus_Married 0.169484 EducationField_Life Sciences 0.356919 Gender_Female 0.408665 Age 0.413286 StockOptionLevel 0.512145 YearsWithCurrManager 0.652676 YearsInCurrentRole 0.686683 YearsAtCompany 0.733775 MaritalStatus_Single 0.773874 TotalWorkingYears 0.779596 EducationField_Medical 0.794118 PercentSalaryHike 0.821128 Department_Sales 0.856158 DistanceFromHome 0.958118 OverTime_Yes 0.964489 NumCompaniesWorked 0.987236 JobLevel 1.025401 MonthlyIncome 1.159989 MaritalStatus_Divorced 1.336093 JobRole_Sales Executive 1.340834 JobRole_Research Scientist 1.512214 YearsSinceLastPromotion 1.520114 BusinessTravel_Travel_Frequently 1.595067 JobRole_Laboratory Technician 1.701604 Attrition 1.844366 PerformanceRating 1.921883 EducationField_Marketing 2.525783 BusinessTravel_Non-Travel 2.632066 JobRole_Manufacturing Director 2.694844 EducationField_Technical Degree 2.872604 JobRole_Healthcare Representative 2.887251 JobRole_Manager 3.392611 JobRole_Sales Representative 3.847192 EducationField_Other 3.875119 JobRole_Research Director 3.932443 Department_Human Resources 4.518824 JobRole_Human Resources 5.035637 EducationField_Human Resources 7.181112 dtype: float64
train_merge3.corr()['Attrition']
BusinessTravel_Non-Travel -0.074457 BusinessTravel_Travel_Frequently 0.115143 BusinessTravel_Travel_Rarely -0.049538 Department_Human Resources 0.016832 Department_Research & Development -0.085293 Department_Sales 0.080855 EducationField_Human Resources 0.036466 EducationField_Life Sciences -0.032703 EducationField_Marketing 0.055781 EducationField_Medical -0.046999 EducationField_Other -0.017898 EducationField_Technical Degree 0.069355 Gender_Female -0.029453 Gender_Male 0.029453 JobRole_Healthcare Representative -0.078696 JobRole_Human Resources 0.036215 JobRole_Laboratory Technician 0.098290 JobRole_Manager -0.083316 JobRole_Manufacturing Director -0.082994 JobRole_Research Director -0.088870 JobRole_Research Scientist -0.000360 JobRole_Sales Executive 0.019774 JobRole_Sales Representative 0.157234 MaritalStatus_Divorced -0.087716 MaritalStatus_Married -0.090984 MaritalStatus_Single 0.175419 Over18_Y NaN OverTime_No -0.246118 OverTime_Yes 0.246118 Attrition 1.000000 Age -0.159205 DailyRate -0.056652 DistanceFromHome 0.077924 Education -0.031373 EmployeeCount NaN EmployeeNumber -0.010577 EnvironmentSatisfaction -0.103369 HourlyRate -0.006846 JobInvolvement -0.130016 JobLevel -0.169105 JobSatisfaction -0.103481 MonthlyIncome -0.146207 MonthlyRate 0.015170 NumCompaniesWorked 0.030383 PercentSalaryHike -0.013478 PerformanceRating 0.002889 RelationshipSatisfaction -0.045872 StandardHours NaN StockOptionLevel -0.186680 TotalWorkingYears -0.183018 TrainingTimesLastYear 0.005146 WorkLifeBalance -0.063939 YearsAtCompany -0.172690 YearsInCurrentRole -0.164386 YearsSinceLastPromotion -0.026458 YearsWithCurrManager -0.150640 Name: Attrition, dtype: float64
# 4. Explore the data using Exploratory Data Analysis- for pairs of Y and all xs
# Read the data
df_attr4 = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
df_attr4.head()
| Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
# EDA
# Seperate Categorical and continious columns
cat = []
con = []
for i in df_attr4.columns:
if (df_attr4[i].dtypes == 'object'):
cat.append(i)
else:
con.append(i)
df_train_cat4 = cat
df_train_cat4 = df_attr4[cat]
df_train_cat4
| BusinessTravel | Department | EducationField | Gender | JobRole | MaritalStatus | Over18 | OverTime | |
|---|---|---|---|---|---|---|---|---|
| 0 | Travel_Rarely | Sales | Life Sciences | Female | Sales Executive | Single | Y | Yes |
| 1 | Travel_Frequently | Research & Development | Life Sciences | Male | Research Scientist | Married | Y | No |
| 2 | Travel_Rarely | Research & Development | Other | Male | Laboratory Technician | Single | Y | Yes |
| 3 | Travel_Frequently | Research & Development | Life Sciences | Female | Research Scientist | Married | Y | Yes |
| 4 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | Travel_Frequently | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
| 1466 | Travel_Rarely | Research & Development | Medical | Male | Healthcare Representative | Married | Y | No |
| 1467 | Travel_Rarely | Research & Development | Life Sciences | Male | Manufacturing Director | Married | Y | Yes |
| 1468 | Travel_Frequently | Sales | Medical | Male | Sales Executive | Married | Y | No |
| 1469 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
1470 rows × 8 columns
df_train_con4 = con
df_train_con4 = df_attr4[con]
df_train_con4
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | 1102 | 1 | 2 | 1 | 1 | 2 | 94 | 3 | 2 | 4 | 5993 | 19479 | 8 | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | 279 | 8 | 1 | 1 | 2 | 3 | 61 | 2 | 2 | 2 | 5130 | 24907 | 1 | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | 1373 | 2 | 2 | 1 | 4 | 4 | 92 | 2 | 1 | 3 | 2090 | 2396 | 6 | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | 1392 | 3 | 4 | 1 | 5 | 4 | 56 | 3 | 1 | 3 | 2909 | 23159 | 1 | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | 591 | 2 | 1 | 1 | 7 | 1 | 40 | 3 | 1 | 2 | 3468 | 16632 | 9 | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 36 | 884 | 23 | 2 | 1 | 2061 | 3 | 41 | 4 | 2 | 4 | 2571 | 12290 | 4 | 17 | 3 | 3 | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 0 | 39 | 613 | 6 | 1 | 1 | 2062 | 4 | 42 | 2 | 3 | 1 | 9991 | 21457 | 4 | 15 | 3 | 1 | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 0 | 27 | 155 | 4 | 3 | 1 | 2064 | 2 | 87 | 4 | 2 | 2 | 6142 | 5174 | 1 | 20 | 4 | 2 | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 0 | 49 | 1023 | 2 | 3 | 1 | 2065 | 4 | 63 | 2 | 2 | 2 | 5390 | 13243 | 2 | 14 | 3 | 4 | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 0 | 34 | 628 | 8 | 3 | 1 | 2068 | 2 | 82 | 4 | 2 | 3 | 4404 | 10228 | 2 | 12 | 3 | 1 | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 27 columns
# Visulization of categorical columns
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train_cat4.columns):
if df_train_cat4[i].dtypes == 'object':
plt.subplot(3,3,x1+1)
sns.countplot(x=df_train_cat4[i])
# Visulisation of continous columns
plt.figure(figsize=(17,19))
for x1,i in enumerate(df_train_con4.columns):
if df_train_con4[i].dtypes == 'int64' or df_train_con4[i].dtypes == 'float64':
plt.subplot(9,3,x1+1)
sns.boxplot(df_train_con4[i])
for i in df_train_con4.columns:
q1 = df_train_con4[i].quantile(0.25)
q2 = df_train_con4[i].quantile(0.75)
IQR = q3-q1
uppertail = q3+1.5*IQR
lowertail = q1-1.5*IQR
df_train_con4.loc[(df_train_con4[i]>uppertail) | (df_train_con4[i]<lowertail)]
mean_1 = df_train_con4[i].mean()
df_train_con4.loc[(df_train_con4[i]>uppertail) | (df_train_con4[i]<lowertail),i]=mean_1
plt.figure(figsize=(17,19))
for x1,i in enumerate(df_train_con4.columns):
if df_train_con4[i].dtypes == 'int64' or df_train_con4[i].dtypes == 'float64':
plt.subplot(9,3,x1+1)
sns.boxplot(df_train_con4[i])
df_dum_train4 = pd.get_dummies(df_train_cat4)
df_dum_train4
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | Over18_Y | OverTime_No | OverTime_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
| 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 2 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
| 3 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 4 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 1466 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 1467 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 1468 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 1469 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
1470 rows × 29 columns
std_scaler = StandardScaler()
std_scaler1 = std_scaler.fit_transform(df_train_con4)
x4 = pd.DataFrame(std_scaler1,columns=df_train_con4.columns)
x4.head()
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.280906 | 7.105427e-15 | 0.0 | -1.412470 | -0.891688 | 0.0 | 4.547474e-13 | -0.660531 | -2.842171e-14 | 0.379672 | -0.057788 | 1.153254 | 0.0 | 1.818989e-12 | 2.125136 | 3.552714e-15 | -0.426230 | -1.584178 | 0.0 | -0.932014 | -0.905232 | -2.171982 | -2.493820 | 0.184086 | -0.018341 | -0.679146 | 0.294570 |
| 1 | -0.438422 | 7.105427e-15 | 0.0 | 0.509209 | -1.868426 | 0.0 | 4.547474e-13 | 0.254625 | -2.842171e-14 | -1.026167 | -0.057788 | -0.660853 | 0.0 | 1.818989e-12 | -0.678049 | 3.552714e-15 | 2.346151 | 1.191438 | 0.0 | 0.241988 | 0.587285 | 0.155707 | 0.338096 | 1.440441 | 0.882230 | -0.368715 | 0.888852 |
| 2 | 2.280906 | 7.105427e-15 | 0.0 | -1.137945 | -0.891688 | 0.0 | 4.547474e-13 | 1.169781 | -2.842171e-14 | -1.026167 | -0.961486 | 0.246200 | 0.0 | 1.818989e-12 | 1.324226 | 3.552714e-15 | -0.426230 | -0.658973 | 0.0 | -0.932014 | -1.360324 | 0.155707 | 0.338096 | -1.700445 | -1.219103 | -0.679146 | -1.191138 |
| 3 | -0.438422 | 7.105427e-15 | 0.0 | -0.863419 | 1.061787 | 0.0 | 4.547474e-13 | 1.169781 | -2.842171e-14 | 0.379672 | -0.961486 | 0.246200 | 0.0 | 1.818989e-12 | -0.678049 | 3.552714e-15 | -0.426230 | 0.266233 | 0.0 | -0.932014 | -0.905232 | 0.155707 | 0.338096 | 0.812264 | 0.882230 | 0.252146 | -1.191138 |
| 4 | -0.438422 | 7.105427e-15 | 0.0 | -1.137945 | -1.868426 | 0.0 | 4.547474e-13 | -1.575686 | -2.842171e-14 | 0.379672 | -0.961486 | -0.660853 | 0.0 | 1.818989e-12 | 2.525591 | 3.552714e-15 | -0.426230 | 1.191438 | 0.0 | 0.241988 | -1.815417 | 0.155707 | 0.338096 | -1.072268 | -0.618722 | -0.058285 | -0.596855 |
df_train_merge4 = pd.concat([df_dum_train4,x4],axis=1)
df_train_merge4.head()
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | Over18_Y | OverTime_No | OverTime_Yes | Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 2.280906 | 7.105427e-15 | 0.0 | -1.412470 | -0.891688 | 0.0 | 4.547474e-13 | -0.660531 | -2.842171e-14 | 0.379672 | -0.057788 | 1.153254 | 0.0 | 1.818989e-12 | 2.125136 | 3.552714e-15 | -0.426230 | -1.584178 | 0.0 | -0.932014 | -0.905232 | -2.171982 | -2.493820 | 0.184086 | -0.018341 | -0.679146 | 0.294570 |
| 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | -0.438422 | 7.105427e-15 | 0.0 | 0.509209 | -1.868426 | 0.0 | 4.547474e-13 | 0.254625 | -2.842171e-14 | -1.026167 | -0.057788 | -0.660853 | 0.0 | 1.818989e-12 | -0.678049 | 3.552714e-15 | 2.346151 | 1.191438 | 0.0 | 0.241988 | 0.587285 | 0.155707 | 0.338096 | 1.440441 | 0.882230 | -0.368715 | 0.888852 |
| 2 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 2.280906 | 7.105427e-15 | 0.0 | -1.137945 | -0.891688 | 0.0 | 4.547474e-13 | 1.169781 | -2.842171e-14 | -1.026167 | -0.961486 | 0.246200 | 0.0 | 1.818989e-12 | 1.324226 | 3.552714e-15 | -0.426230 | -0.658973 | 0.0 | -0.932014 | -1.360324 | 0.155707 | 0.338096 | -1.700445 | -1.219103 | -0.679146 | -1.191138 |
| 3 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | -0.438422 | 7.105427e-15 | 0.0 | -0.863419 | 1.061787 | 0.0 | 4.547474e-13 | 1.169781 | -2.842171e-14 | 0.379672 | -0.961486 | 0.246200 | 0.0 | 1.818989e-12 | -0.678049 | 3.552714e-15 | -0.426230 | 0.266233 | 0.0 | -0.932014 | -0.905232 | 0.155707 | 0.338096 | 0.812264 | 0.882230 | 0.252146 | -1.191138 |
| 4 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | -0.438422 | 7.105427e-15 | 0.0 | -1.137945 | -1.868426 | 0.0 | 4.547474e-13 | -1.575686 | -2.842171e-14 | 0.379672 | -0.961486 | -0.660853 | 0.0 | 1.818989e-12 | 2.525591 | 3.552714e-15 | -0.426230 | 1.191438 | 0.0 | 0.241988 | -1.815417 | 0.155707 | 0.338096 | -1.072268 | -0.618722 | -0.058285 | -0.596855 |
df_train_merge4.corr()
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | Over18_Y | OverTime_No | OverTime_Yes | Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| BusinessTravel_Non-Travel | 1.000000 | -0.162435 | -0.526850 | -0.004755 | -0.005013 | 0.007283 | 0.020835 | 0.005311 | -0.030567 | 0.012828 | -0.013389 | 0.004171 | -0.050461 | 0.050461 | 0.012878 | -0.015890 | 0.009270 | 0.014078 | -0.013536 | -0.021431 | -0.010116 | 0.031022 | -0.033780 | 0.057455 | -0.043635 | -0.004622 | NaN | 0.037163 | -0.037163 | -0.074457 | NaN | NaN | 0.008097 | 0.004524 | NaN | NaN | 0.003568 | NaN | -0.045779 | -0.007295 | 0.019802 | NaN | NaN | 0.002718 | NaN | 0.018310 | 0.021132 | NaN | 0.028807 | 0.018476 | -0.020746 | 0.005780 | 0.023331 | 0.014365 | 0.020815 | 0.029183 |
| BusinessTravel_Travel_Frequently | -0.162435 | 1.000000 | -0.753092 | -0.007485 | 0.003340 | -0.000160 | 0.011818 | 0.031128 | -0.016586 | -0.005367 | -0.011004 | -0.023569 | 0.022015 | -0.022015 | 0.008029 | 0.001896 | 0.010023 | -0.042583 | 0.009783 | -0.023579 | -0.004461 | -0.010175 | 0.055469 | 0.005779 | -0.030785 | 0.027734 | NaN | -0.029392 | 0.029392 | 0.115143 | NaN | NaN | -0.022222 | -0.008292 | NaN | NaN | -0.012624 | NaN | 0.004424 | -0.021557 | 0.027117 | NaN | NaN | -0.039718 | NaN | 0.016463 | 0.028500 | NaN | -0.016142 | 0.006153 | 0.006193 | 0.010199 | 0.021100 | 0.013334 | 0.023216 | 0.029774 |
| BusinessTravel_Travel_Rarely | -0.526850 | -0.753092 | 1.000000 | 0.009618 | 0.000465 | -0.004718 | -0.024073 | -0.030355 | 0.034668 | -0.003930 | 0.018406 | 0.017521 | 0.014682 | -0.014682 | -0.015503 | 0.008962 | -0.014815 | 0.027294 | 0.000598 | 0.034600 | 0.010588 | -0.011920 | -0.025257 | -0.043287 | 0.055613 | -0.020808 | NaN | 0.000539 | -0.000539 | -0.049538 | NaN | NaN | 0.013743 | 0.004126 | NaN | NaN | 0.008496 | NaN | 0.026714 | 0.023433 | -0.036562 | NaN | NaN | 0.032401 | NaN | -0.026390 | -0.038640 | NaN | -0.005303 | -0.017619 | 0.008498 | -0.012640 | -0.033732 | -0.021064 | -0.033877 | -0.045106 |
| Department_Human Resources | -0.004755 | -0.007485 | 0.009618 | 1.000000 | -0.290754 | -0.139650 | 0.646436 | -0.068040 | -0.073692 | -0.049761 | -0.007527 | -0.019469 | -0.035652 | 0.035652 | -0.066186 | 0.904983 | -0.097859 | 0.087615 | -0.070000 | -0.050765 | -0.105352 | -0.112959 | -0.051764 | 0.016037 | 0.034767 | -0.051443 | NaN | 0.006178 | -0.006178 | 0.016832 | NaN | NaN | -0.019777 | 0.011435 | NaN | NaN | -0.007597 | NaN | 0.004789 | -0.006157 | -0.024068 | NaN | NaN | 0.020618 | NaN | -0.006385 | 0.034583 | NaN | -0.004000 | -0.007662 | -0.040022 | 0.047763 | -0.005295 | -0.033121 | -0.026931 | -0.021503 |
| Department_Research & Development | -0.005013 | 0.003340 | 0.000465 | -0.290754 | 1.000000 | -0.906818 | -0.187954 | 0.127321 | -0.478520 | 0.183548 | 0.064751 | 0.038541 | -0.015760 | 0.015760 | 0.227637 | -0.263128 | 0.336570 | -0.071356 | 0.240754 | 0.174596 | 0.362340 | -0.733497 | -0.336127 | 0.035158 | -0.019997 | -0.009990 | NaN | 0.003036 | -0.003036 | -0.085293 | NaN | NaN | -0.026238 | -0.018604 | NaN | NaN | 0.027976 | NaN | 0.023187 | -0.107830 | -0.002798 | NaN | NaN | 0.022237 | NaN | 0.032720 | -0.004587 | NaN | 0.016927 | -0.004442 | -0.006819 | -0.069922 | -0.036307 | -0.037461 | -0.021497 | -0.024626 |
| Department_Sales | 0.007283 | -0.000160 | -0.004718 | -0.139650 | -0.906818 | 1.000000 | -0.090275 | -0.101791 | 0.527691 | -0.168034 | -0.063695 | -0.031309 | 0.032017 | -0.032017 | -0.206425 | -0.126381 | -0.305208 | 0.035248 | -0.218320 | -0.158327 | -0.328576 | 0.808869 | 0.370667 | -0.043451 | 0.005378 | 0.033002 | NaN | -0.005864 | 0.005864 | 0.080855 | NaN | NaN | 0.035867 | 0.014215 | NaN | NaN | -0.025606 | NaN | -0.026107 | 0.114307 | 0.013499 | NaN | NaN | -0.032097 | NaN | -0.031050 | -0.010489 | NaN | -0.015755 | 0.007973 | 0.024688 | 0.051320 | 0.039907 | 0.053360 | 0.034112 | 0.034959 |
| EducationField_Human Resources | 0.020835 | 0.011818 | -0.024073 | 0.646436 | -0.187954 | -0.090275 | 1.000000 | -0.114559 | -0.047637 | -0.092899 | -0.033248 | -0.042964 | -0.028956 | 0.028956 | -0.042785 | 0.549751 | -0.063260 | 0.082271 | -0.045251 | -0.032816 | -0.068103 | -0.073020 | -0.033462 | 0.012107 | 0.057339 | -0.072051 | NaN | -0.004040 | 0.004040 | 0.036466 | NaN | NaN | -0.005234 | 0.026479 | NaN | NaN | -0.006898 | NaN | 0.002079 | 0.010409 | -0.021467 | NaN | NaN | 0.031007 | NaN | -0.016167 | 0.041105 | NaN | 0.021206 | 0.004348 | -0.037664 | -0.003967 | -0.008196 | -0.020759 | -0.023700 | -0.025943 |
| EducationField_Life Sciences | 0.005311 | 0.031128 | -0.030355 | -0.068040 | 0.127321 | -0.101791 | -0.114559 | 1.000000 | -0.291660 | -0.568774 | -0.203560 | -0.263050 | -0.006770 | 0.006770 | 0.029084 | -0.063119 | 0.044359 | -0.011143 | 0.052023 | 0.018401 | 0.043729 | -0.091122 | -0.043208 | -0.002672 | -0.017866 | 0.021469 | NaN | 0.013787 | -0.013787 | -0.032703 | NaN | NaN | -0.045394 | 0.013184 | NaN | NaN | -0.024526 | NaN | 0.003228 | -0.008431 | 0.052004 | NaN | NaN | -0.006131 | NaN | 0.010853 | -0.019973 | NaN | -0.017993 | 0.047855 | -0.039018 | -0.039728 | -0.013111 | 0.004035 | -0.002480 | 0.003488 |
| EducationField_Marketing | -0.030567 | -0.016586 | 0.034668 | -0.073692 | -0.478520 | 0.527691 | -0.047637 | -0.291660 | 1.000000 | -0.236514 | -0.084647 | -0.109385 | 0.024143 | -0.024143 | -0.108929 | -0.066690 | -0.161055 | 0.025577 | -0.115206 | -0.083548 | -0.173387 | 0.457308 | 0.133065 | -0.007212 | 0.018491 | -0.013323 | NaN | -0.014607 | 0.014607 | 0.055781 | NaN | NaN | 0.054723 | 0.072405 | NaN | NaN | 0.000479 | NaN | -0.018657 | 0.092698 | -0.023528 | NaN | NaN | -0.018611 | NaN | -0.020918 | -0.006580 | NaN | 0.022560 | 0.029466 | -0.029046 | 0.018500 | 0.016770 | 0.037816 | 0.006219 | 0.022774 |
| EducationField_Medical | 0.012828 | -0.005367 | -0.003930 | -0.049761 | 0.183548 | -0.168034 | -0.092899 | -0.568774 | -0.236514 | 1.000000 | -0.165072 | -0.213314 | 0.013146 | -0.013146 | 0.034165 | -0.042895 | 0.066262 | -0.001128 | 0.035496 | 0.062898 | 0.039735 | -0.133532 | -0.051990 | 0.013316 | -0.007139 | -0.004249 | NaN | -0.002246 | 0.002246 | -0.046999 | NaN | NaN | 0.021864 | -0.072335 | NaN | NaN | -0.021299 | NaN | 0.017103 | -0.014114 | -0.022645 | NaN | NaN | 0.024826 | NaN | 0.014868 | 0.030494 | NaN | 0.033750 | -0.050973 | 0.070542 | 0.001641 | 0.009381 | -0.020740 | 0.022665 | -0.005065 |
| EducationField_Other | -0.013389 | -0.011004 | 0.018406 | -0.007527 | 0.064751 | -0.063695 | -0.033248 | -0.203560 | -0.084647 | -0.165072 | 1.000000 | -0.076343 | -0.022992 | 0.022992 | 0.017609 | 0.001594 | 0.058759 | -0.008046 | -0.010820 | -0.006044 | 0.005286 | -0.036995 | -0.033774 | 0.005411 | -0.009171 | 0.004972 | NaN | -0.024970 | 0.024970 | -0.017898 | NaN | NaN | 0.027156 | 0.038043 | NaN | NaN | 0.064602 | NaN | -0.011895 | -0.016724 | 0.003380 | NaN | NaN | -0.012870 | NaN | 0.011449 | -0.020305 | NaN | -0.042100 | -0.039133 | -0.008151 | 0.031812 | -0.022235 | -0.006238 | -0.039931 | -0.015490 |
| EducationField_Technical Degree | 0.004171 | -0.023569 | 0.017521 | -0.019469 | 0.038541 | -0.031309 | -0.042964 | -0.263050 | -0.109385 | -0.213314 | -0.076343 | 1.000000 | -0.003886 | 0.003886 | 0.018681 | -0.008623 | -0.026589 | -0.038946 | 0.007817 | -0.022905 | 0.076218 | -0.058843 | 0.057185 | -0.019243 | 0.002710 | 0.014265 | NaN | 0.017723 | -0.017723 | 0.069355 | NaN | NaN | -0.036177 | -0.026742 | NaN | NaN | 0.027713 | NaN | -0.004519 | -0.054707 | -0.019795 | NaN | NaN | -0.013819 | NaN | -0.021729 | -0.011044 | NaN | -0.024560 | -0.002168 | 0.008289 | 0.021962 | 0.010803 | 0.000445 | 0.003853 | 0.002107 |
| Gender_Female | -0.050461 | 0.022015 | 0.014682 | -0.035652 | -0.015760 | 0.032017 | -0.028956 | -0.006770 | 0.024143 | 0.013146 | -0.022992 | -0.003886 | 1.000000 | -1.000000 | -0.006823 | -0.036082 | -0.067793 | 0.033880 | 0.065197 | 0.006121 | -0.009745 | 0.005348 | 0.028877 | -0.046076 | 0.007804 | 0.032752 | NaN | -0.041924 | 0.041924 | -0.029453 | NaN | NaN | -0.003885 | 0.016547 | NaN | NaN | -0.000508 | NaN | -0.017960 | 0.039403 | -0.033252 | NaN | NaN | 0.039147 | NaN | 0.013859 | -0.022868 | NaN | -0.012716 | 0.040533 | 0.038787 | 0.002753 | 0.026297 | 0.032762 | 0.026985 | 0.034464 |
| Gender_Male | 0.050461 | -0.022015 | -0.014682 | 0.035652 | 0.015760 | -0.032017 | 0.028956 | 0.006770 | -0.024143 | -0.013146 | 0.022992 | 0.003886 | -1.000000 | 1.000000 | 0.006823 | 0.036082 | 0.067793 | -0.033880 | -0.065197 | -0.006121 | 0.009745 | -0.005348 | -0.028877 | 0.046076 | -0.007804 | -0.032752 | NaN | 0.041924 | -0.041924 | 0.029453 | NaN | NaN | 0.003885 | -0.016547 | NaN | NaN | 0.000508 | NaN | 0.017960 | -0.039403 | 0.033252 | NaN | NaN | -0.039147 | NaN | -0.013859 | 0.022868 | NaN | 0.012716 | -0.040533 | -0.038787 | -0.002753 | -0.026297 | -0.032762 | -0.026985 | -0.034464 |
| JobRole_Healthcare Representative | 0.012878 | 0.008029 | -0.015503 | -0.066186 | 0.227637 | -0.206425 | -0.042785 | 0.029084 | -0.108929 | 0.034165 | 0.017609 | 0.018681 | -0.006823 | 0.006823 | 1.000000 | -0.059898 | -0.144652 | -0.085409 | -0.103472 | -0.075038 | -0.155727 | -0.166971 | -0.076515 | 0.027897 | 0.004913 | -0.030126 | NaN | 0.000382 | -0.000382 | -0.078696 | NaN | NaN | 0.024199 | 0.024270 | NaN | NaN | 0.014090 | NaN | 0.001272 | 0.115704 | 0.016367 | NaN | NaN | 0.026955 | NaN | -0.000928 | -0.005090 | NaN | 0.014021 | 0.102770 | -0.012432 | -0.026101 | 0.080206 | 0.067537 | 0.075902 | 0.043270 |
| JobRole_Human Resources | -0.015890 | 0.001896 | 0.008962 | 0.904983 | -0.263128 | -0.126381 | 0.549751 | -0.063119 | -0.066690 | -0.042895 | 0.001594 | -0.008623 | -0.036082 | 0.036082 | -0.059898 | 1.000000 | -0.088561 | -0.052290 | -0.063349 | -0.045941 | -0.095342 | -0.102226 | -0.046845 | 0.021541 | 0.030995 | -0.052320 | NaN | 0.014026 | -0.014026 | 0.036215 | NaN | NaN | -0.026719 | -0.005295 | NaN | NaN | -0.022014 | NaN | -0.004952 | -0.100922 | -0.029681 | NaN | NaN | 0.020578 | NaN | -0.010154 | 0.044169 | NaN | -0.009864 | -0.032191 | -0.035902 | 0.043887 | -0.020259 | -0.053260 | -0.054603 | -0.047546 |
| JobRole_Laboratory Technician | 0.009270 | 0.010023 | -0.014815 | -0.097859 | 0.336570 | -0.305208 | -0.063260 | 0.044359 | -0.161055 | 0.066262 | 0.058759 | -0.026589 | -0.067793 | 0.067793 | -0.144652 | -0.088561 | 1.000000 | -0.126280 | -0.152987 | -0.110947 | -0.230248 | -0.246873 | -0.113130 | -0.011224 | -0.009233 | 0.019873 | NaN | 0.044774 | -0.044774 | 0.098290 | NaN | NaN | 0.014273 | -0.063566 | NaN | NaN | -0.001533 | NaN | -0.022724 | -0.344608 | -0.015710 | NaN | NaN | -0.021121 | NaN | 0.010796 | -0.010691 | NaN | 0.013386 | -0.071452 | 0.053998 | -0.028209 | -0.125688 | -0.119437 | -0.110099 | -0.117188 |
| JobRole_Manager | 0.014078 | -0.042583 | 0.027294 | 0.087615 | -0.071356 | 0.035248 | 0.082271 | -0.011143 | 0.025577 | -0.001128 | -0.008046 | -0.038946 | 0.033880 | -0.033880 | -0.085409 | -0.052290 | -0.126280 | 1.000000 | -0.090330 | -0.065508 | -0.135949 | -0.145765 | -0.066797 | 0.001997 | 0.049982 | -0.055176 | NaN | 0.011086 | -0.011086 | -0.083316 | NaN | NaN | -0.066843 | 0.028453 | NaN | NaN | 0.010730 | NaN | 0.017112 | 0.552744 | -0.005620 | NaN | NaN | 0.042125 | NaN | 0.032050 | 0.025638 | NaN | -0.015637 | 0.160364 | 0.003052 | 0.005137 | 0.076676 | 0.137535 | 0.224255 | 0.146055 |
| JobRole_Manufacturing Director | -0.013536 | 0.009783 | 0.000598 | -0.070000 | 0.240754 | -0.218320 | -0.045251 | 0.052023 | -0.115206 | 0.035496 | -0.010820 | 0.007817 | 0.065197 | -0.065197 | -0.103472 | -0.063349 | -0.152987 | -0.090330 | 1.000000 | -0.079362 | -0.164700 | -0.176592 | -0.080924 | 0.020543 | 0.002819 | -0.021331 | NaN | 0.010302 | -0.010302 | -0.082994 | NaN | NaN | -0.019004 | -0.005290 | NaN | NaN | 0.059178 | NaN | -0.021939 | 0.114896 | -0.013747 | NaN | NaN | 0.009580 | NaN | 0.029775 | 0.003640 | NaN | 0.007735 | 0.033861 | -0.013987 | 0.002011 | 0.087510 | 0.075061 | -0.007241 | 0.084649 |
| JobRole_Research Director | -0.021431 | -0.023579 | 0.034600 | -0.050765 | 0.174596 | -0.158327 | -0.032816 | 0.018401 | -0.083548 | 0.062898 | -0.006044 | -0.022905 | 0.006121 | -0.006121 | -0.075038 | -0.045941 | -0.110947 | -0.065508 | -0.079362 | 1.000000 | -0.119442 | -0.128066 | -0.058687 | 0.037524 | 0.008271 | -0.042299 | NaN | -0.002400 | 0.002400 | -0.088870 | NaN | NaN | -0.003730 | 0.049694 | NaN | NaN | -0.048689 | NaN | 0.015200 | 0.414319 | -0.006217 | NaN | NaN | 0.097925 | NaN | -0.035744 | -0.005492 | NaN | 0.015807 | 0.140892 | -0.004527 | 0.034403 | 0.078271 | 0.075569 | 0.074455 | 0.116442 |
| JobRole_Research Scientist | -0.010116 | -0.004461 | 0.010588 | -0.105352 | 0.362340 | -0.328576 | -0.068103 | 0.043729 | -0.173387 | 0.039735 | 0.005286 | 0.076218 | -0.009745 | 0.009745 | -0.155727 | -0.095342 | -0.230248 | -0.135949 | -0.164700 | -0.119442 | 1.000000 | -0.265775 | -0.121792 | -0.012115 | -0.039987 | 0.053522 | NaN | -0.054378 | 0.054378 | -0.000360 | NaN | NaN | -0.021396 | 0.000709 | NaN | NaN | 0.001940 | NaN | 0.047604 | -0.387788 | 0.020503 | NaN | NaN | -0.043981 | NaN | 0.019416 | -0.003116 | NaN | -0.011635 | -0.169943 | -0.052126 | -0.058613 | -0.115012 | -0.123154 | -0.105237 | -0.124838 |
| JobRole_Sales Executive | 0.031022 | -0.010175 | -0.011920 | -0.112959 | -0.733497 | 0.808869 | -0.073020 | -0.091122 | 0.457308 | -0.133532 | -0.036995 | -0.058843 | 0.005348 | -0.005348 | -0.166971 | -0.102226 | -0.246873 | -0.145765 | -0.176592 | -0.128066 | -0.265775 | 1.000000 | -0.130586 | -0.013853 | 0.005751 | 0.006210 | NaN | -0.006341 | 0.006341 | 0.019774 | NaN | NaN | 0.055332 | 0.053398 | NaN | NaN | -0.024421 | NaN | -0.011413 | 0.127490 | 0.012604 | NaN | NaN | 0.005913 | NaN | -0.041401 | -0.004836 | NaN | 0.015756 | -0.050006 | 0.013241 | 0.032092 | 0.133917 | 0.109504 | 0.049202 | 0.099893 |
| JobRole_Sales Representative | -0.033780 | 0.055469 | -0.025257 | -0.051764 | -0.336127 | 0.370667 | -0.033462 | -0.043208 | 0.133065 | -0.051990 | -0.033774 | 0.057185 | 0.028877 | -0.028877 | -0.076515 | -0.046845 | -0.113130 | -0.066797 | -0.080924 | -0.058687 | -0.121792 | -0.130586 | 1.000000 | -0.052890 | -0.023659 | 0.072439 | NaN | -0.003347 | 0.003347 | 0.157234 | NaN | NaN | 0.007154 | -0.091465 | NaN | NaN | 0.002949 | NaN | -0.027282 | -0.216559 | 0.001413 | NaN | NaN | -0.104494 | NaN | -0.006214 | -0.024859 | NaN | -0.048067 | 0.041842 | 0.040377 | 0.045148 | -0.191950 | -0.150470 | -0.085622 | -0.170527 |
| MaritalStatus_Divorced | 0.057455 | 0.005779 | -0.043287 | 0.016037 | 0.035158 | -0.043451 | 0.012107 | -0.002672 | -0.007212 | 0.013316 | 0.005411 | -0.019243 | -0.046076 | 0.046076 | 0.027897 | 0.021541 | -0.011224 | 0.001997 | 0.020543 | 0.037524 | -0.012115 | -0.013853 | -0.052890 | 1.000000 | -0.491506 | -0.366691 | NaN | -0.023462 | 0.023462 | -0.087716 | NaN | NaN | 0.025673 | -0.002439 | NaN | NaN | 0.016439 | NaN | 0.016815 | 0.037087 | -0.015197 | NaN | NaN | 0.040824 | NaN | -0.010310 | 0.006199 | NaN | 0.446285 | 0.014843 | 0.008405 | -0.009080 | 0.011309 | 0.023047 | -0.005279 | 0.015815 |
| MaritalStatus_Married | -0.043635 | -0.030785 | 0.055613 | 0.034767 | -0.019997 | 0.005378 | 0.057339 | -0.017866 | 0.018491 | -0.007139 | -0.009171 | 0.002710 | 0.007804 | -0.007804 | 0.004913 | 0.030995 | -0.009233 | 0.049982 | 0.002819 | 0.008271 | -0.039987 | 0.005751 | -0.023659 | -0.491506 | 1.000000 | -0.629981 | NaN | 0.013502 | -0.013502 | -0.090984 | NaN | NaN | 0.002933 | -0.001865 | NaN | NaN | -0.022180 | NaN | 0.028324 | 0.050547 | -0.010315 | NaN | NaN | -0.016142 | NaN | 0.009585 | -0.043382 | NaN | 0.225574 | -0.005217 | -0.029602 | -0.006388 | 0.066529 | 0.055687 | 0.054102 | 0.036885 |
| MaritalStatus_Single | -0.004622 | 0.027734 | -0.020808 | -0.051443 | -0.009990 | 0.033002 | -0.072051 | 0.021469 | -0.013323 | -0.004249 | 0.004972 | 0.014265 | 0.032752 | -0.032752 | -0.030126 | -0.052320 | 0.019873 | -0.055176 | -0.021331 | -0.042299 | 0.053522 | 0.006210 | 0.072439 | -0.366691 | -0.629981 | 1.000000 | NaN | 0.006498 | -0.006498 | 0.175419 | NaN | NaN | -0.026027 | 0.004168 | NaN | NaN | 0.009035 | NaN | -0.045253 | -0.087072 | 0.024571 | NaN | NaN | -0.019161 | NaN | -0.001045 | 0.040817 | NaN | -0.638957 | -0.007663 | 0.024129 | 0.014921 | -0.081157 | -0.080043 | -0.053090 | -0.053507 |
| Over18_Y | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| OverTime_No | 0.037163 | -0.029392 | 0.000539 | 0.006178 | 0.003036 | -0.005864 | -0.004040 | 0.013787 | -0.014607 | -0.002246 | -0.024970 | 0.017723 | -0.041924 | 0.041924 | 0.000382 | 0.014026 | 0.044774 | 0.011086 | 0.010302 | -0.002400 | -0.054378 | -0.006341 | -0.003347 | -0.023462 | 0.013502 | 0.006498 | NaN | 1.000000 | -1.000000 | -0.246118 | NaN | NaN | -0.038315 | 0.020322 | NaN | NaN | -0.070132 | NaN | 0.003507 | -0.000544 | -0.024539 | NaN | NaN | 0.020786 | NaN | -0.004369 | -0.048493 | NaN | 0.000449 | -0.003604 | 0.079113 | 0.027092 | 0.048276 | 0.010861 | 0.012239 | 0.033366 |
| OverTime_Yes | -0.037163 | 0.029392 | -0.000539 | -0.006178 | -0.003036 | 0.005864 | 0.004040 | -0.013787 | 0.014607 | 0.002246 | 0.024970 | -0.017723 | 0.041924 | -0.041924 | -0.000382 | -0.014026 | -0.044774 | -0.011086 | -0.010302 | 0.002400 | 0.054378 | 0.006341 | 0.003347 | 0.023462 | -0.013502 | -0.006498 | NaN | -1.000000 | 1.000000 | 0.246118 | NaN | NaN | 0.038315 | -0.020322 | NaN | NaN | 0.070132 | NaN | -0.003507 | 0.000544 | 0.024539 | NaN | NaN | -0.020786 | NaN | 0.004369 | 0.048493 | NaN | -0.000449 | 0.003604 | -0.079113 | -0.027092 | -0.048276 | -0.010861 | -0.012239 | -0.033366 |
| Attrition | -0.074457 | 0.115143 | -0.049538 | 0.016832 | -0.085293 | 0.080855 | 0.036466 | -0.032703 | 0.055781 | -0.046999 | -0.017898 | 0.069355 | -0.029453 | 0.029453 | -0.078696 | 0.036215 | 0.098290 | -0.083316 | -0.082994 | -0.088870 | -0.000360 | 0.019774 | 0.157234 | -0.087716 | -0.090984 | 0.175419 | NaN | -0.246118 | 0.246118 | 1.000000 | NaN | NaN | 0.073971 | -0.031373 | NaN | NaN | -0.103369 | NaN | -0.130016 | -0.169105 | -0.103481 | NaN | NaN | 0.043494 | NaN | 0.002889 | -0.045872 | NaN | -0.137145 | -0.033917 | -0.059478 | -0.063939 | -0.182422 | -0.164386 | -0.033019 | -0.150640 |
| Age | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DailyRate | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DistanceFromHome | 0.008097 | -0.022222 | 0.013743 | -0.019777 | -0.026238 | 0.035867 | -0.005234 | -0.045394 | 0.054723 | 0.021864 | 0.027156 | -0.036177 | -0.003885 | 0.003885 | 0.024199 | -0.026719 | 0.014273 | -0.066843 | -0.019004 | -0.003730 | -0.021396 | 0.055332 | 0.007154 | 0.025673 | 0.002933 | -0.026027 | NaN | -0.038315 | 0.038315 | 0.073971 | NaN | NaN | 1.000000 | 0.030019 | NaN | NaN | -0.001822 | NaN | 0.048667 | -0.014518 | -0.028497 | NaN | NaN | -0.011595 | NaN | -0.000746 | 0.005234 | NaN | 0.021909 | -0.003448 | -0.006813 | -0.023779 | 0.014902 | 0.005106 | -0.013109 | 0.002100 |
| Education | 0.004524 | -0.008292 | 0.004126 | 0.011435 | -0.018604 | 0.014215 | 0.026479 | 0.013184 | 0.072405 | -0.072335 | 0.038043 | -0.026742 | 0.016547 | -0.016547 | 0.024270 | -0.005295 | -0.063566 | 0.028453 | -0.005290 | 0.049694 | 0.000709 | 0.053398 | -0.091465 | -0.002439 | -0.001865 | 0.004168 | NaN | 0.020322 | -0.020322 | -0.031373 | NaN | NaN | 0.030019 | 1.000000 | NaN | NaN | -0.027128 | NaN | 0.042438 | 0.101589 | -0.011296 | NaN | NaN | 0.126317 | NaN | -0.024539 | -0.009118 | NaN | 0.018422 | 0.062663 | -0.025100 | 0.009819 | 0.047600 | 0.056703 | 0.054254 | 0.055449 |
| EmployeeCount | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| EmployeeNumber | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| EnvironmentSatisfaction | 0.003568 | -0.012624 | 0.008496 | -0.007597 | 0.027976 | -0.025606 | -0.006898 | -0.024526 | 0.000479 | -0.021299 | 0.064602 | 0.027713 | -0.000508 | 0.000508 | 0.014090 | -0.022014 | -0.001533 | 0.010730 | 0.059178 | -0.048689 | 0.001940 | -0.024421 | 0.002949 | 0.016439 | -0.022180 | 0.009035 | NaN | -0.070132 | 0.070132 | -0.103369 | NaN | NaN | -0.001822 | -0.027128 | NaN | NaN | 1.000000 | NaN | -0.008278 | 0.001212 | -0.006784 | NaN | NaN | 0.012594 | NaN | -0.029548 | 0.007665 | NaN | 0.003432 | 0.024419 | -0.019359 | 0.027627 | 0.020692 | 0.040470 | 0.016194 | -0.007923 |
| HourlyRate | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| JobInvolvement | -0.045779 | 0.004424 | 0.026714 | 0.004789 | 0.023187 | -0.026107 | 0.002079 | 0.003228 | -0.018657 | 0.017103 | -0.011895 | -0.004519 | -0.017960 | 0.017960 | 0.001272 | -0.004952 | -0.022724 | 0.017112 | -0.021939 | 0.015200 | 0.047604 | -0.011413 | -0.027282 | 0.016815 | 0.028324 | -0.045253 | NaN | 0.003507 | -0.003507 | -0.130016 | NaN | NaN | 0.048667 | 0.042438 | NaN | NaN | -0.008278 | NaN | 1.000000 | -0.012630 | -0.021476 | NaN | NaN | 0.015012 | NaN | -0.029071 | 0.034297 | NaN | 0.021523 | -0.017304 | -0.015338 | -0.014617 | 0.033586 | 0.004117 | -0.024184 | 0.025310 |
| JobLevel | -0.007295 | -0.021557 | 0.023433 | -0.006157 | -0.107830 | 0.114307 | 0.010409 | -0.008431 | 0.092698 | -0.014114 | -0.016724 | -0.054707 | 0.039403 | -0.039403 | 0.115704 | -0.100922 | -0.344608 | 0.552744 | 0.114896 | 0.414319 | -0.387788 | 0.127490 | -0.216559 | 0.037087 | 0.050547 | -0.087072 | NaN | -0.000544 | 0.000544 | -0.169105 | NaN | NaN | -0.014518 | 0.101589 | NaN | NaN | 0.001212 | NaN | -0.012630 | 1.000000 | -0.001944 | NaN | NaN | 0.142501 | NaN | -0.021222 | 0.021642 | NaN | 0.013984 | 0.356467 | -0.018191 | 0.037818 | 0.306263 | 0.332018 | 0.353885 | 0.361762 |
| JobSatisfaction | 0.019802 | 0.027117 | -0.036562 | -0.024068 | -0.002798 | 0.013499 | -0.021467 | 0.052004 | -0.023528 | -0.022645 | 0.003380 | -0.019795 | -0.033252 | 0.033252 | 0.016367 | -0.029681 | -0.015710 | -0.005620 | -0.013747 | -0.006217 | 0.020503 | 0.012604 | 0.001413 | -0.015197 | -0.010315 | 0.024571 | NaN | -0.024539 | 0.024539 | -0.103481 | NaN | NaN | -0.028497 | -0.011296 | NaN | NaN | -0.006784 | NaN | -0.021476 | -0.001944 | 1.000000 | NaN | NaN | -0.055699 | NaN | 0.002297 | -0.012454 | NaN | 0.010690 | 0.002016 | -0.005779 | -0.019459 | 0.018744 | 0.007938 | -0.018214 | -0.019899 |
| MonthlyIncome | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| MonthlyRate | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| NumCompaniesWorked | 0.002718 | -0.039718 | 0.032401 | 0.020618 | 0.022237 | -0.032097 | 0.031007 | -0.006131 | -0.018611 | 0.024826 | -0.012870 | -0.013819 | 0.039147 | -0.039147 | 0.026955 | 0.020578 | -0.021121 | 0.042125 | 0.009580 | 0.097925 | -0.043981 | 0.005913 | -0.104494 | 0.040824 | -0.016142 | -0.019161 | NaN | 0.020786 | -0.020786 | 0.043494 | NaN | NaN | -0.011595 | 0.126317 | NaN | NaN | 0.012594 | NaN | 0.015012 | 0.142501 | -0.055699 | NaN | NaN | 1.000000 | NaN | -0.014095 | 0.052733 | NaN | 0.030075 | 0.042451 | -0.066054 | -0.008366 | -0.142141 | -0.111122 | -0.036814 | -0.120770 |
| PercentSalaryHike | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| PerformanceRating | 0.018310 | 0.016463 | -0.026390 | -0.006385 | 0.032720 | -0.031050 | -0.016167 | 0.010853 | -0.020918 | 0.014868 | 0.011449 | -0.021729 | 0.013859 | -0.013859 | -0.000928 | -0.010154 | 0.010796 | 0.032050 | 0.029775 | -0.035744 | 0.019416 | -0.041401 | -0.006214 | -0.010310 | 0.009585 | -0.001045 | NaN | -0.004369 | 0.004369 | 0.002889 | NaN | NaN | -0.000746 | -0.024539 | NaN | NaN | -0.029548 | NaN | -0.029071 | -0.021222 | 0.002297 | NaN | NaN | -0.014095 | NaN | 1.000000 | -0.031351 | NaN | 0.003506 | 0.011048 | -0.015579 | 0.002572 | 0.029423 | 0.034031 | 0.017896 | 0.025381 |
| RelationshipSatisfaction | 0.021132 | 0.028500 | -0.038640 | 0.034583 | -0.004587 | -0.010489 | 0.041105 | -0.019973 | -0.006580 | 0.030494 | -0.020305 | -0.011044 | -0.022868 | 0.022868 | -0.005090 | 0.044169 | -0.010691 | 0.025638 | 0.003640 | -0.005492 | -0.003116 | -0.004836 | -0.024859 | 0.006199 | -0.043382 | 0.040817 | NaN | -0.048493 | 0.048493 | -0.045872 | NaN | NaN | 0.005234 | -0.009118 | NaN | NaN | 0.007665 | NaN | 0.034297 | 0.021642 | -0.012454 | NaN | NaN | 0.052733 | NaN | -0.031351 | 1.000000 | NaN | -0.045952 | -0.004342 | 0.002497 | 0.019604 | -0.021786 | -0.021509 | 0.033493 | -0.005533 |
| StandardHours | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| StockOptionLevel | 0.028807 | -0.016142 | -0.005303 | -0.004000 | 0.016927 | -0.015755 | 0.021206 | -0.017993 | 0.022560 | 0.033750 | -0.042100 | -0.024560 | -0.012716 | 0.012716 | 0.014021 | -0.009864 | 0.013386 | -0.015637 | 0.007735 | 0.015807 | -0.011635 | 0.015756 | -0.048067 | 0.446285 | 0.225574 | -0.638957 | NaN | 0.000449 | -0.000449 | -0.137145 | NaN | NaN | 0.021909 | 0.018422 | NaN | NaN | 0.003432 | NaN | 0.021523 | 0.013984 | 0.010690 | NaN | NaN | 0.030075 | NaN | 0.003506 | -0.045952 | NaN | 1.000000 | 0.016672 | 0.011274 | 0.004129 | 0.057714 | 0.062072 | 0.014352 | 0.037675 |
| TotalWorkingYears | 0.018476 | 0.006153 | -0.017619 | -0.007662 | -0.004442 | 0.007973 | 0.004348 | 0.047855 | 0.029466 | -0.050973 | -0.039133 | -0.002168 | 0.040533 | -0.040533 | 0.102770 | -0.032191 | -0.071452 | 0.160364 | 0.033861 | 0.140892 | -0.169943 | -0.050006 | 0.041842 | 0.014843 | -0.005217 | -0.007663 | NaN | -0.003604 | 0.003604 | -0.033917 | NaN | NaN | -0.003448 | 0.062663 | NaN | NaN | 0.024419 | NaN | -0.017304 | 0.356467 | 0.002016 | NaN | NaN | 0.042451 | NaN | 0.011048 | -0.004342 | NaN | 0.016672 | 1.000000 | 0.009694 | 0.003894 | 0.198983 | 0.228646 | 0.216931 | 0.244631 |
| TrainingTimesLastYear | -0.020746 | 0.006193 | 0.008498 | -0.040022 | -0.006819 | 0.024688 | -0.037664 | -0.039018 | -0.029046 | 0.070542 | -0.008151 | 0.008289 | 0.038787 | -0.038787 | -0.012432 | -0.035902 | 0.053998 | 0.003052 | -0.013987 | -0.004527 | -0.052126 | 0.013241 | 0.040377 | 0.008405 | -0.029602 | 0.024129 | NaN | 0.079113 | -0.079113 | -0.059478 | NaN | NaN | -0.006813 | -0.025100 | NaN | NaN | -0.019359 | NaN | -0.015338 | -0.018191 | -0.005779 | NaN | NaN | -0.066054 | NaN | -0.015579 | 0.002497 | NaN | 0.011274 | 0.009694 | 1.000000 | 0.028072 | 0.000134 | -0.001871 | -0.002067 | -0.003578 |
| WorkLifeBalance | 0.005780 | 0.010199 | -0.012640 | 0.047763 | -0.069922 | 0.051320 | -0.003967 | -0.039728 | 0.018500 | 0.001641 | 0.031812 | 0.021962 | 0.002753 | -0.002753 | -0.026101 | 0.043887 | -0.028209 | 0.005137 | 0.002011 | 0.034403 | -0.058613 | 0.032092 | 0.045148 | -0.009080 | -0.006388 | 0.014921 | NaN | 0.027092 | -0.027092 | -0.063939 | NaN | NaN | -0.023779 | 0.009819 | NaN | NaN | 0.027627 | NaN | -0.014617 | 0.037818 | -0.019459 | NaN | NaN | -0.008366 | NaN | 0.002572 | 0.019604 | NaN | 0.004129 | 0.003894 | 0.028072 | 1.000000 | 0.006952 | 0.036562 | 0.008941 | -0.005749 |
| YearsAtCompany | 0.023331 | 0.021100 | -0.033732 | -0.005295 | -0.036307 | 0.039907 | -0.008196 | -0.013111 | 0.016770 | 0.009381 | -0.022235 | 0.010803 | 0.026297 | -0.026297 | 0.080206 | -0.020259 | -0.125688 | 0.076676 | 0.087510 | 0.078271 | -0.115012 | 0.133917 | -0.191950 | 0.011309 | 0.066529 | -0.081157 | NaN | 0.048276 | -0.048276 | -0.182422 | NaN | NaN | 0.014902 | 0.047600 | NaN | NaN | 0.020692 | NaN | 0.033586 | 0.306263 | 0.018744 | NaN | NaN | -0.142141 | NaN | 0.029423 | -0.021786 | NaN | 0.057714 | 0.198983 | 0.000134 | 0.006952 | 1.000000 | 0.771709 | 0.421157 | 0.745037 |
| YearsInCurrentRole | 0.014365 | 0.013334 | -0.021064 | -0.033121 | -0.037461 | 0.053360 | -0.020759 | 0.004035 | 0.037816 | -0.020740 | -0.006238 | 0.000445 | 0.032762 | -0.032762 | 0.067537 | -0.053260 | -0.119437 | 0.137535 | 0.075061 | 0.075569 | -0.123154 | 0.109504 | -0.150470 | 0.023047 | 0.055687 | -0.080043 | NaN | 0.010861 | -0.010861 | -0.164386 | NaN | NaN | 0.005106 | 0.056703 | NaN | NaN | 0.040470 | NaN | 0.004117 | 0.332018 | 0.007938 | NaN | NaN | -0.111122 | NaN | 0.034031 | -0.021509 | NaN | 0.062072 | 0.228646 | -0.001871 | 0.036562 | 0.771709 | 1.000000 | 0.501135 | 0.705200 |
| YearsSinceLastPromotion | 0.020815 | 0.023216 | -0.033877 | -0.026931 | -0.021497 | 0.034112 | -0.023700 | -0.002480 | 0.006219 | 0.022665 | -0.039931 | 0.003853 | 0.026985 | -0.026985 | 0.075902 | -0.054603 | -0.110099 | 0.224255 | -0.007241 | 0.074455 | -0.105237 | 0.049202 | -0.085622 | -0.005279 | 0.054102 | -0.053090 | NaN | 0.012239 | -0.012239 | -0.033019 | NaN | NaN | -0.013109 | 0.054254 | NaN | NaN | 0.016194 | NaN | -0.024184 | 0.353885 | -0.018214 | NaN | NaN | -0.036814 | NaN | 0.017896 | 0.033493 | NaN | 0.014352 | 0.216931 | -0.002067 | 0.008941 | 0.421157 | 0.501135 | 1.000000 | 0.479890 |
| YearsWithCurrManager | 0.029183 | 0.029774 | -0.045106 | -0.021503 | -0.024626 | 0.034959 | -0.025943 | 0.003488 | 0.022774 | -0.005065 | -0.015490 | 0.002107 | 0.034464 | -0.034464 | 0.043270 | -0.047546 | -0.117188 | 0.146055 | 0.084649 | 0.116442 | -0.124838 | 0.099893 | -0.170527 | 0.015815 | 0.036885 | -0.053507 | NaN | 0.033366 | -0.033366 | -0.150640 | NaN | NaN | 0.002100 | 0.055449 | NaN | NaN | -0.007923 | NaN | 0.025310 | 0.361762 | -0.019899 | NaN | NaN | -0.120770 | NaN | 0.025381 | -0.005533 | NaN | 0.037675 | 0.244631 | -0.003578 | -0.005749 | 0.745037 | 0.705200 | 0.479890 | 1.000000 |
df_train_merge4.describe()
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | Over18_Y | OverTime_No | OverTime_Yes | Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1.470000e+03 | 1.470000e+03 | 1470.0 | 1.470000e+03 | 1.470000e+03 | 1470.0 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1470.0 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1470.0 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 | 1.470000e+03 |
| mean | 0.102041 | 0.188435 | 0.709524 | 0.042857 | 0.653741 | 0.303401 | 0.018367 | 0.412245 | 0.108163 | 0.315646 | 0.055782 | 0.089796 | 0.400000 | 0.600000 | 0.089116 | 0.035374 | 0.176190 | 0.069388 | 0.098639 | 0.054422 | 0.198639 | 0.221769 | 0.056463 | 0.222449 | 0.457823 | 0.319728 | 1.0 | 0.717007 | 0.282993 | -7.106938e-17 | 7.105427e-15 | 0.0 | -1.970457e-16 | 2.697011e-16 | 0.0 | 4.547474e-13 | 7.054070e-17 | -2.842171e-14 | -6.495182e-18 | 1.479542e-16 | -7.816876e-18 | 0.0 | 1.818989e-12 | -5.588878e-17 | 3.552714e-15 | -7.766274e-16 | -1.910792e-16 | 0.0 | 6.600918e-17 | 1.253721e-17 | -1.106069e-16 | -5.165936e-17 | -1.540340e-16 | -8.851574e-17 | -4.236974e-17 | -6.993650e-17 |
| std | 0.302805 | 0.391193 | 0.454137 | 0.202604 | 0.475939 | 0.459884 | 0.134321 | 0.492406 | 0.310692 | 0.464931 | 0.229579 | 0.285986 | 0.490065 | 0.490065 | 0.285008 | 0.184786 | 0.381112 | 0.254199 | 0.298279 | 0.226925 | 0.399112 | 0.415578 | 0.230891 | 0.416033 | 0.498387 | 0.466530 | 0.0 | 0.450606 | 0.450606 | 1.000340e+00 | 0.000000e+00 | 0.0 | 1.000340e+00 | 1.000340e+00 | 0.0 | 0.000000e+00 | 1.000340e+00 | 0.000000e+00 | 1.000340e+00 | 1.000340e+00 | 1.000340e+00 | 0.0 | 0.000000e+00 | 1.000340e+00 | 0.000000e+00 | 1.000340e+00 | 1.000340e+00 | 0.0 | 1.000340e+00 | 1.000340e+00 | 1.000340e+00 | 1.000340e+00 | 1.000340e+00 | 1.000340e+00 | 1.000340e+00 | 1.000340e+00 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 | 0.000000 | 0.000000 | -4.384223e-01 | 7.105427e-15 | 0.0 | -1.412470e+00 | -1.868426e+00 | 0.0 | 4.547474e-13 | -1.575686e+00 | -2.842171e-14 | -2.432006e+00 | -9.614864e-01 | -1.567907e+00 | 0.0 | 1.818989e-12 | -1.078504e+00 | 3.552714e-15 | -4.262300e-01 | -1.584178e+00 | 0.0 | -9.320144e-01 | -2.270509e+00 | -2.171982e+00 | -2.493820e+00 | -1.700445e+00 | -1.219103e+00 | -6.791457e-01 | -1.191138e+00 |
| 25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 | 0.000000 | 0.000000 | -4.384223e-01 | 7.105427e-15 | 0.0 | -1.137945e+00 | -8.916883e-01 | 0.0 | 4.547474e-13 | -6.605307e-01 | -2.842171e-14 | -1.026167e+00 | -9.614864e-01 | -6.608532e-01 | 0.0 | 1.818989e-12 | -6.780494e-01 | 3.552714e-15 | -4.262300e-01 | -6.589728e-01 | 0.0 | -9.320144e-01 | -9.052321e-01 | -6.201892e-01 | -1.077862e+00 | -7.581791e-01 | -6.187220e-01 | -6.791457e-01 | -5.968549e-01 |
| 50% | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 | 1.000000 | 0.000000 | -4.384223e-01 | 7.105427e-15 | 0.0 | 2.346832e-01 | 8.504925e-02 | 0.0 | 4.547474e-13 | 2.546249e-01 | -2.842171e-14 | 3.796721e-01 | -5.778755e-02 | 2.462002e-01 | 0.0 | 1.818989e-12 | -2.775943e-01 | 3.552714e-15 | -4.262300e-01 | 2.662326e-01 | 0.0 | 2.419883e-01 | 5.872853e-01 | 1.557071e-01 | 3.380962e-01 | -1.300020e-01 | -3.185315e-01 | -3.687153e-01 | -2.997134e-01 |
| 75% | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | -4.384223e-01 | 7.105427e-15 | 0.0 | 8.365853e-01 | 1.061787e+00 | 0.0 | 4.547474e-13 | 1.169781e+00 | -2.842171e-14 | 3.796721e-01 | 8.459113e-01 | 1.153254e+00 | 0.0 | 1.818989e-12 | 5.233157e-01 | 3.552714e-15 | -4.262300e-01 | 1.191438e+00 | 0.0 | 2.419883e-01 | 5.872853e-01 | 1.557071e-01 | 3.380962e-01 | 5.007390e-01 | 8.822302e-01 | 2.521455e-01 | 8.888524e-01 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | 2.280906e+00 | 7.105427e-15 | 0.0 | 2.156362e+00 | 2.038524e+00 | 0.0 | 4.547474e-13 | 1.169781e+00 | -2.842171e-14 | 1.785511e+00 | 2.653309e+00 | 1.153254e+00 | 0.0 | 1.818989e-12 | 2.525591e+00 | 3.552714e-15 | 2.346151e+00 | 1.191438e+00 | 0.0 | 2.589994e+00 | 5.872853e-01 | 2.483396e+00 | 1.754054e+00 | 2.382706e+00 | 2.983563e+00 | 3.977310e+00 | 2.968843e+00 |
# 5. Visualize all the distribution relationship
# 1. Seperate categorical columns
cat = []
con = []
for i in df_attr.columns:
if (df_attr[i].dtypes == 'object'):
cat.append(i)
else:
con.append(i)
df_train5_cat1 = cat
df_train5_cat1 = df_attr[cat]
df_train5_cat1
| BusinessTravel | Department | EducationField | Gender | JobRole | MaritalStatus | Over18 | OverTime | AGE_GROUP | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Travel_Rarely | Sales | Life Sciences | Female | Sales Executive | Single | Y | Yes | GROUP2 |
| 1 | Travel_Frequently | Research & Development | Life Sciences | Male | Research Scientist | Married | Y | No | GROUP3 |
| 2 | Travel_Rarely | Research & Development | Other | Male | Laboratory Technician | Single | Y | Yes | GROUP2 |
| 3 | Travel_Frequently | Research & Development | Life Sciences | Female | Research Scientist | Married | Y | Yes | GROUP2 |
| 4 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No | GROUP1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | Travel_Frequently | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No | GROUP2 |
| 1466 | Travel_Rarely | Research & Development | Medical | Male | Healthcare Representative | Married | Y | No | GROUP2 |
| 1467 | Travel_Rarely | Research & Development | Life Sciences | Male | Manufacturing Director | Married | Y | Yes | GROUP1 |
| 1468 | Travel_Frequently | Sales | Medical | Male | Sales Executive | Married | Y | No | GROUP3 |
| 1469 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No | GROUP2 |
1470 rows × 9 columns
df_train5_con1 = con
df_train5_con1 = df_attr[con]
df_train5_con1
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | 1102 | 1 | 2 | 1 | 1 | 2 | 94 | 3 | 2 | 4 | 5993 | 19479 | 8 | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | 279 | 8 | 1 | 1 | 2 | 3 | 61 | 2 | 2 | 2 | 5130 | 24907 | 1 | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | 1373 | 2 | 2 | 1 | 4 | 4 | 92 | 2 | 1 | 3 | 2090 | 2396 | 6 | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | 1392 | 3 | 4 | 1 | 5 | 4 | 56 | 3 | 1 | 3 | 2909 | 23159 | 1 | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | 591 | 2 | 1 | 1 | 7 | 1 | 40 | 3 | 1 | 2 | 3468 | 16632 | 9 | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 36 | 884 | 23 | 2 | 1 | 2061 | 3 | 41 | 4 | 2 | 4 | 2571 | 12290 | 4 | 17 | 3 | 3 | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 0 | 39 | 613 | 6 | 1 | 1 | 2062 | 4 | 42 | 2 | 3 | 1 | 9991 | 21457 | 4 | 15 | 3 | 1 | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 0 | 27 | 155 | 4 | 3 | 1 | 2064 | 2 | 87 | 4 | 2 | 2 | 6142 | 5174 | 1 | 20 | 4 | 2 | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 0 | 49 | 1023 | 2 | 3 | 1 | 2065 | 4 | 63 | 2 | 2 | 2 | 5390 | 13243 | 2 | 14 | 3 | 4 | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 0 | 34 | 628 | 8 | 3 | 1 | 2068 | 2 | 82 | 4 | 2 | 3 | 4404 | 10228 | 2 | 12 | 3 | 1 | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 27 columns
# Visulasation of categorical columns
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train5_cat1.columns):
if df_train5_cat1[i].dtypes=='object':
plt.subplot(3,3,x1+1)
sns.countplot(x=df_train5_cat1[i])
# Visulisation of continious columns
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train5_con1.columns):
if df_train5_con1[i].dtypes == 'int64' or df_train5_con1[i].dypes == 'float64':
plt.subplot(9,3,x1+1)
sns.distplot(df_train5_con1[i])
df_attr.skew().sort_values()
WorkLifeBalance -0.552480 JobInvolvement -0.498419 JobSatisfaction -0.329672 EnvironmentSatisfaction -0.321654 RelationshipSatisfaction -0.302828 Education -0.289681 HourlyRate -0.032311 DailyRate -0.003519 EmployeeCount 0.000000 StandardHours 0.000000 EmployeeNumber 0.016574 MonthlyRate 0.018578 Age 0.413286 TrainingTimesLastYear 0.553124 PercentSalaryHike 0.821128 YearsWithCurrManager 0.833451 YearsInCurrentRole 0.917363 DistanceFromHome 0.958118 StockOptionLevel 0.968980 JobLevel 1.025401 NumCompaniesWorked 1.026471 TotalWorkingYears 1.117172 MonthlyIncome 1.369817 YearsAtCompany 1.764529 Attrition 1.844366 PerformanceRating 1.921883 YearsSinceLastPromotion 1.984290 dtype: float64
df_attr.corr()
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition | 1.000000 | -0.159205 | -0.056652 | 0.077924 | -0.031373 | NaN | -0.010577 | -0.103369 | -0.006846 | -0.130016 | -0.169105 | -0.103481 | -0.159840 | 0.015170 | 0.043494 | -0.013478 | 0.002889 | -0.045872 | NaN | -0.137145 | -0.171063 | -0.059478 | -0.063939 | -0.134392 | -0.160545 | -0.033019 | -0.156199 |
| Age | -0.159205 | 1.000000 | 0.010661 | -0.001686 | 0.208034 | NaN | -0.010145 | 0.010146 | 0.024287 | 0.029820 | 0.509604 | -0.004892 | 0.497855 | 0.028051 | 0.299635 | 0.003634 | 0.001904 | 0.053535 | NaN | 0.037510 | 0.680381 | -0.019621 | -0.021490 | 0.311309 | 0.212901 | 0.216513 | 0.202089 |
| DailyRate | -0.056652 | 0.010661 | 1.000000 | -0.004985 | -0.016806 | NaN | -0.050990 | 0.018355 | 0.023381 | 0.046135 | 0.002966 | 0.030571 | 0.007707 | -0.032182 | 0.038153 | 0.022704 | 0.000473 | 0.007846 | NaN | 0.042143 | 0.014515 | 0.002453 | -0.037848 | -0.034055 | 0.009932 | -0.033229 | -0.026363 |
| DistanceFromHome | 0.077924 | -0.001686 | -0.004985 | 1.000000 | 0.021042 | NaN | 0.032916 | -0.016075 | 0.031131 | 0.008783 | 0.005303 | -0.003669 | -0.017014 | 0.027473 | -0.029251 | 0.040235 | 0.027110 | 0.006557 | NaN | 0.044872 | 0.004628 | -0.036942 | -0.026556 | 0.009508 | 0.018845 | 0.010029 | 0.014406 |
| Education | -0.031373 | 0.208034 | -0.016806 | 0.021042 | 1.000000 | NaN | 0.042070 | -0.027128 | 0.016775 | 0.042438 | 0.101589 | -0.011296 | 0.094961 | -0.026084 | 0.126317 | -0.011111 | -0.024539 | -0.009118 | NaN | 0.018422 | 0.148280 | -0.025100 | 0.009819 | 0.069114 | 0.060236 | 0.054254 | 0.069065 |
| EmployeeCount | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| EmployeeNumber | -0.010577 | -0.010145 | -0.050990 | 0.032916 | 0.042070 | NaN | 1.000000 | 0.017621 | 0.035179 | -0.006888 | -0.018519 | -0.046247 | -0.014829 | 0.012648 | -0.001251 | -0.012944 | -0.020359 | -0.069861 | NaN | 0.062227 | -0.014365 | 0.023603 | 0.010309 | -0.011240 | -0.008416 | -0.009019 | -0.009197 |
| EnvironmentSatisfaction | -0.103369 | 0.010146 | 0.018355 | -0.016075 | -0.027128 | NaN | 0.017621 | 1.000000 | -0.049857 | -0.008278 | 0.001212 | -0.006784 | -0.006259 | 0.037600 | 0.012594 | -0.031701 | -0.029548 | 0.007665 | NaN | 0.003432 | -0.002693 | -0.019359 | 0.027627 | 0.001458 | 0.018007 | 0.016194 | -0.004999 |
| HourlyRate | -0.006846 | 0.024287 | 0.023381 | 0.031131 | 0.016775 | NaN | 0.035179 | -0.049857 | 1.000000 | 0.042861 | -0.027853 | -0.071335 | -0.015794 | -0.015297 | 0.022157 | -0.009062 | -0.002172 | 0.001330 | NaN | 0.050263 | -0.002334 | -0.008548 | -0.004607 | -0.019582 | -0.024106 | -0.026716 | -0.020123 |
| JobInvolvement | -0.130016 | 0.029820 | 0.046135 | 0.008783 | 0.042438 | NaN | -0.006888 | -0.008278 | 0.042861 | 1.000000 | -0.012630 | -0.021476 | -0.015271 | -0.016322 | 0.015012 | -0.017205 | -0.029071 | 0.034297 | NaN | 0.021523 | -0.005533 | -0.015338 | -0.014617 | -0.021355 | 0.008717 | -0.024184 | 0.025976 |
| JobLevel | -0.169105 | 0.509604 | 0.002966 | 0.005303 | 0.101589 | NaN | -0.018519 | 0.001212 | -0.027853 | -0.012630 | 1.000000 | -0.001944 | 0.950300 | 0.039563 | 0.142501 | -0.034730 | -0.021222 | 0.021642 | NaN | 0.013984 | 0.782208 | -0.018191 | 0.037818 | 0.534739 | 0.389447 | 0.353885 | 0.375281 |
| JobSatisfaction | -0.103481 | -0.004892 | 0.030571 | -0.003669 | -0.011296 | NaN | -0.046247 | -0.006784 | -0.071335 | -0.021476 | -0.001944 | 1.000000 | -0.007157 | 0.000644 | -0.055699 | 0.020002 | 0.002297 | -0.012454 | NaN | 0.010690 | -0.020185 | -0.005779 | -0.019459 | -0.003803 | -0.002305 | -0.018214 | -0.027656 |
| MonthlyIncome | -0.159840 | 0.497855 | 0.007707 | -0.017014 | 0.094961 | NaN | -0.014829 | -0.006259 | -0.015794 | -0.015271 | 0.950300 | -0.007157 | 1.000000 | 0.034814 | 0.149515 | -0.027269 | -0.017120 | 0.025873 | NaN | 0.005408 | 0.772893 | -0.021736 | 0.030683 | 0.514285 | 0.363818 | 0.344978 | 0.344079 |
| MonthlyRate | 0.015170 | 0.028051 | -0.032182 | 0.027473 | -0.026084 | NaN | 0.012648 | 0.037600 | -0.015297 | -0.016322 | 0.039563 | 0.000644 | 0.034814 | 1.000000 | 0.017521 | -0.006429 | -0.009811 | -0.004085 | NaN | -0.034323 | 0.026442 | 0.001467 | 0.007963 | -0.023655 | -0.012815 | 0.001567 | -0.036746 |
| NumCompaniesWorked | 0.043494 | 0.299635 | 0.038153 | -0.029251 | 0.126317 | NaN | -0.001251 | 0.012594 | 0.022157 | 0.015012 | 0.142501 | -0.055699 | 0.149515 | 0.017521 | 1.000000 | -0.010238 | -0.014095 | 0.052733 | NaN | 0.030075 | 0.237639 | -0.066054 | -0.008366 | -0.118421 | -0.090754 | -0.036814 | -0.110319 |
| PercentSalaryHike | -0.013478 | 0.003634 | 0.022704 | 0.040235 | -0.011111 | NaN | -0.012944 | -0.031701 | -0.009062 | -0.017205 | -0.034730 | 0.020002 | -0.027269 | -0.006429 | -0.010238 | 1.000000 | 0.773550 | -0.040490 | NaN | 0.007528 | -0.020608 | -0.005221 | -0.003280 | -0.035991 | -0.001520 | -0.022154 | -0.011985 |
| PerformanceRating | 0.002889 | 0.001904 | 0.000473 | 0.027110 | -0.024539 | NaN | -0.020359 | -0.029548 | -0.002172 | -0.029071 | -0.021222 | 0.002297 | -0.017120 | -0.009811 | -0.014095 | 0.773550 | 1.000000 | -0.031351 | NaN | 0.003506 | 0.006744 | -0.015579 | 0.002572 | 0.003435 | 0.034986 | 0.017896 | 0.022827 |
| RelationshipSatisfaction | -0.045872 | 0.053535 | 0.007846 | 0.006557 | -0.009118 | NaN | -0.069861 | 0.007665 | 0.001330 | 0.034297 | 0.021642 | -0.012454 | 0.025873 | -0.004085 | 0.052733 | -0.040490 | -0.031351 | 1.000000 | NaN | -0.045952 | 0.024054 | 0.002497 | 0.019604 | 0.019367 | -0.015123 | 0.033493 | -0.000867 |
| StandardHours | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| StockOptionLevel | -0.137145 | 0.037510 | 0.042143 | 0.044872 | 0.018422 | NaN | 0.062227 | 0.003432 | 0.050263 | 0.021523 | 0.013984 | 0.010690 | 0.005408 | -0.034323 | 0.030075 | 0.007528 | 0.003506 | -0.045952 | NaN | 1.000000 | 0.010136 | 0.011274 | 0.004129 | 0.015058 | 0.050818 | 0.014352 | 0.024698 |
| TotalWorkingYears | -0.171063 | 0.680381 | 0.014515 | 0.004628 | 0.148280 | NaN | -0.014365 | -0.002693 | -0.002334 | -0.005533 | 0.782208 | -0.020185 | 0.772893 | 0.026442 | 0.237639 | -0.020608 | 0.006744 | 0.024054 | NaN | 0.010136 | 1.000000 | -0.035662 | 0.001008 | 0.628133 | 0.460365 | 0.404858 | 0.459188 |
| TrainingTimesLastYear | -0.059478 | -0.019621 | 0.002453 | -0.036942 | -0.025100 | NaN | 0.023603 | -0.019359 | -0.008548 | -0.015338 | -0.018191 | -0.005779 | -0.021736 | 0.001467 | -0.066054 | -0.005221 | -0.015579 | 0.002497 | NaN | 0.011274 | -0.035662 | 1.000000 | 0.028072 | 0.003569 | -0.005738 | -0.002067 | -0.004096 |
| WorkLifeBalance | -0.063939 | -0.021490 | -0.037848 | -0.026556 | 0.009819 | NaN | 0.010309 | 0.027627 | -0.004607 | -0.014617 | 0.037818 | -0.019459 | 0.030683 | 0.007963 | -0.008366 | -0.003280 | 0.002572 | 0.019604 | NaN | 0.004129 | 0.001008 | 0.028072 | 1.000000 | 0.012089 | 0.049856 | 0.008941 | 0.002759 |
| YearsAtCompany | -0.134392 | 0.311309 | -0.034055 | 0.009508 | 0.069114 | NaN | -0.011240 | 0.001458 | -0.019582 | -0.021355 | 0.534739 | -0.003803 | 0.514285 | -0.023655 | -0.118421 | -0.035991 | 0.003435 | 0.019367 | NaN | 0.015058 | 0.628133 | 0.003569 | 0.012089 | 1.000000 | 0.758754 | 0.618409 | 0.769212 |
| YearsInCurrentRole | -0.160545 | 0.212901 | 0.009932 | 0.018845 | 0.060236 | NaN | -0.008416 | 0.018007 | -0.024106 | 0.008717 | 0.389447 | -0.002305 | 0.363818 | -0.012815 | -0.090754 | -0.001520 | 0.034986 | -0.015123 | NaN | 0.050818 | 0.460365 | -0.005738 | 0.049856 | 0.758754 | 1.000000 | 0.548056 | 0.714365 |
| YearsSinceLastPromotion | -0.033019 | 0.216513 | -0.033229 | 0.010029 | 0.054254 | NaN | -0.009019 | 0.016194 | -0.026716 | -0.024184 | 0.353885 | -0.018214 | 0.344978 | 0.001567 | -0.036814 | -0.022154 | 0.017896 | 0.033493 | NaN | 0.014352 | 0.404858 | -0.002067 | 0.008941 | 0.618409 | 0.548056 | 1.000000 | 0.510224 |
| YearsWithCurrManager | -0.156199 | 0.202089 | -0.026363 | 0.014406 | 0.069065 | NaN | -0.009197 | -0.004999 | -0.020123 | 0.025976 | 0.375281 | -0.027656 | 0.344079 | -0.036746 | -0.110319 | -0.011985 | 0.022827 | -0.000867 | NaN | 0.024698 | 0.459188 | -0.004096 | 0.002759 | 0.769212 | 0.714365 | 0.510224 | 1.000000 |
# Perform Test of Hypothesis :- Compare rates for same level male & female, check relationship between
# categorical variable like Age & Gender / Gender & Education field, Age & Income etc
df_train6 = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
df_train6.head()
| Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
df_train6.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition 1470 non-null int64 1 Age 1470 non-null int64 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null int64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null int64 6 Education 1470 non-null int64 7 EducationField 1470 non-null object 8 EmployeeCount 1470 non-null int64 9 EmployeeNumber 1470 non-null int64 10 EnvironmentSatisfaction 1470 non-null int64 11 Gender 1470 non-null object 12 HourlyRate 1470 non-null int64 13 JobInvolvement 1470 non-null int64 14 JobLevel 1470 non-null int64 15 JobRole 1470 non-null object 16 JobSatisfaction 1470 non-null int64 17 MaritalStatus 1470 non-null object 18 MonthlyIncome 1470 non-null int64 19 MonthlyRate 1470 non-null int64 20 NumCompaniesWorked 1470 non-null int64 21 Over18 1470 non-null object 22 OverTime 1470 non-null object 23 PercentSalaryHike 1470 non-null int64 24 PerformanceRating 1470 non-null int64 25 RelationshipSatisfaction 1470 non-null int64 26 StandardHours 1470 non-null int64 27 StockOptionLevel 1470 non-null int64 28 TotalWorkingYears 1470 non-null int64 29 TrainingTimesLastYear 1470 non-null int64 30 WorkLifeBalance 1470 non-null int64 31 YearsAtCompany 1470 non-null int64 32 YearsInCurrentRole 1470 non-null int64 33 YearsSinceLastPromotion 1470 non-null int64 34 YearsWithCurrManager 1470 non-null int64 dtypes: int64(27), object(8) memory usage: 402.1+ KB
# 1] Comparing BusinessTravel vs Gender
BG = pd.crosstab(df_train5_cat1.BusinessTravel,df_train6.Gender)
BG
| Gender | Female | Male |
|---|---|---|
| BusinessTravel | ||
| Non-Travel | 49 | 101 |
| Travel_Frequently | 117 | 160 |
| Travel_Rarely | 422 | 621 |
from scipy.stats import chi2_contingency
chi_sqr, p_value, DOF, EXP = chi2_contingency(BG)
chi_sqr
4.031372310350092
p_value
0.13322895625828154
DOF
2
EXP
array([[ 60. , 90. ],
[110.8, 166.2],
[417.2, 625.8]])
sns.countplot(x=df_train6.BusinessTravel,hue=df_train6.Gender)
<AxesSubplot: xlabel='BusinessTravel', ylabel='count'>
# 2] comparing Gender vs Monthlyrate
GM = pd.crosstab(df_train6.Gender,df_train6.MonthlyRate)
GM
| MonthlyRate | 2094 | 2097 | 2104 | 2112 | 2122 | 2125 | 2137 | 2227 | 2243 | 2253 | 2261 | 2288 | 2302 | 2323 | 2326 | 2338 | 2354 | 2373 | 2396 | 2437 | 2447 | 2493 | 2539 | 2560 | 2561 | 2613 | 2671 | 2689 | 2690 | 2706 | 2721 | 2725 | 2739 | 2755 | 2819 | 2823 | 2845 | 2851 | 2890 | 2900 | 2912 | 2939 | 2967 | 2975 | 2993 | 2997 | 3010 | 3020 | 3031 | 3032 | 3064 | 3072 | 3088 | 3119 | 3129 | 3140 | 3142 | 3156 | 3157 | 3164 | 3173 | 3193 | 3208 | 3297 | 3300 | 3334 | 3335 | 3339 | 3356 | 3372 | 3376 | 3395 | 3415 | 3423 | 3425 | 3427 | 3445 | 3449 | 3458 | 3465 | 3487 | 3498 | 3525 | 3536 | 3549 | 3567 | 3622 | 3666 | 3687 | 3692 | 3698 | 3708 | 3735 | 3787 | 3809 | 3810 | 3811 | 3835 | 3840 | 3854 | ... | 24852 | 24907 | 24920 | 24941 | 24978 | 25043 | 25063 | 25098 | 25103 | 25150 | 25166 | 25174 | 25178 | 25198 | 25233 | 25258 | 25265 | 25275 | 25291 | 25308 | 25326 | 25348 | 25353 | 25388 | 25412 | 25422 | 25440 | 25470 | 25479 | 25518 | 25527 | 25549 | 25592 | 25594 | 25605 | 25657 | 25681 | 25713 | 25725 | 25751 | 25755 | 25761 | 25796 | 25800 | 25811 | 25812 | 25846 | 25927 | 25949 | 25952 | 25995 | 26009 | 26062 | 26075 | 26076 | 26085 | 26092 | 26124 | 26176 | 26186 | 26204 | 26227 | 26236 | 26250 | 26278 | 26283 | 26285 | 26308 | 26312 | 26314 | 26342 | 26362 | 26376 | 26427 | 26458 | 26493 | 26496 | 26507 | 26537 | 26542 | 26551 | 26582 | 26589 | 26619 | 26703 | 26707 | 26767 | 26820 | 26841 | 26849 | 26862 | 26894 | 26897 | 26914 | 26933 | 26956 | 26959 | 26968 | 26997 | 26999 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Gender | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| Female | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 |
| Male | 0 | 1 | 1 | 0 | 1 | 2 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 2 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 |
2 rows × 1427 columns
chi_sqr, p_value, DOF, EXP = chi2_contingency(GM)
chi_sqr
1421.388888888889
p_value
0.5294748183389015
DOF
1426
# 3] comparing Gender vs Department
GD = pd.crosstab(df_train6.Gender,df_train6.Department)
GD
| Department | Human Resources | Research & Development | Sales |
|---|---|---|---|
| Gender | |||
| Female | 20 | 379 | 189 |
| Male | 43 | 582 | 257 |
plt.figure(figsize=(10,9))
sns.countplot(x=df_train6.Gender,hue=df_train6.Department)
<AxesSubplot: xlabel='Gender', ylabel='count'>
# 4] Comparing Gender vs Age
GA = pd.crosstab(df_train6.Gender,df_train6.Age)
GA
| Age | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Gender | |||||||||||||||||||||||||||||||||||||||||||
| Female | 4 | 4 | 6 | 6 | 5 | 1 | 11 | 6 | 14 | 22 | 12 | 29 | 26 | 28 | 23 | 24 | 33 | 27 | 29 | 18 | 27 | 16 | 24 | 15 | 16 | 14 | 10 | 17 | 15 | 14 | 7 | 12 | 9 | 7 | 7 | 10 | 15 | 4 | 7 | 0 | 6 | 6 | 2 |
| Male | 4 | 5 | 5 | 7 | 11 | 13 | 15 | 20 | 25 | 26 | 36 | 39 | 34 | 41 | 38 | 34 | 44 | 51 | 40 | 32 | 31 | 26 | 33 | 25 | 30 | 18 | 23 | 24 | 18 | 10 | 12 | 12 | 21 | 12 | 11 | 9 | 3 | 18 | 7 | 4 | 8 | 4 | 3 |
chi_sqr, p_value, DOF, EXP = chi2_contingency(GA)
chi_sqr
53.537114950915246
p_value
0.10927801670328505
# 5] comparing Overtime and Age
OA = pd.crosstab(df_train6.OverTime,df_train6.Age)
OA
| Age | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| OverTime | |||||||||||||||||||||||||||||||||||||||||||
| No | 6 | 5 | 7 | 10 | 9 | 11 | 18 | 18 | 27 | 36 | 36 | 49 | 48 | 49 | 50 | 40 | 60 | 52 | 53 | 34 | 40 | 33 | 42 | 27 | 31 | 24 | 18 | 31 | 25 | 14 | 17 | 18 | 17 | 15 | 13 | 14 | 15 | 12 | 11 | 3 | 5 | 7 | 4 |
| Yes | 2 | 4 | 4 | 3 | 7 | 3 | 8 | 8 | 12 | 12 | 12 | 19 | 12 | 20 | 11 | 18 | 17 | 26 | 16 | 16 | 18 | 9 | 15 | 13 | 15 | 8 | 15 | 10 | 8 | 10 | 2 | 6 | 13 | 4 | 5 | 5 | 3 | 10 | 3 | 1 | 9 | 3 | 1 |
chi_sqr, p_value, DOF, EXP = chi2_contingency(OA)
chi_sqr
44.366828521428644
p_value
0.37220025607537444
# 6] Compairing Maritalstatus and Gender
MG=pd.crosstab(df_train6.MaritalStatus,df_train6.Gender)
MG
| Gender | Female | Male |
|---|---|---|
| MaritalStatus | ||
| Divorced | 117 | 210 |
| Married | 272 | 401 |
| Single | 199 | 271 |
chi_sqr, p_value, DOF, EXP = chi2_contingency(MG)
chi_sqr
3.5478394206821307
p_value
0.1696666396487212
plt.figure(figsize=(10,9))
sns.countplot(x=df_train6.MaritalStatus, hue = df_train6.Gender)
<AxesSubplot: xlabel='MaritalStatus', ylabel='count'>
# Compare age and Monthly income
AM = pd.crosstab(df_train6.Age,df_train6.MonthlyIncome)
AM.head()
| MonthlyIncome | 1009 | 1051 | 1052 | 1081 | 1091 | 1102 | 1118 | 1129 | 1200 | 1223 | 1232 | 1261 | 1274 | 1281 | 1359 | 1393 | 1416 | 1420 | 1483 | 1514 | 1555 | 1563 | 1569 | 1601 | 1611 | 1675 | 1702 | 1706 | 1790 | 1859 | 1878 | 1904 | 1951 | 2001 | 2007 | 2008 | 2011 | 2013 | 2014 | 2018 | 2022 | 2024 | 2028 | 2029 | 2033 | 2042 | 2044 | 2045 | 2058 | 2061 | 2062 | 2064 | 2066 | 2070 | 2073 | 2074 | 2075 | 2080 | 2083 | 2086 | 2088 | 2089 | 2090 | 2093 | 2096 | 2097 | 2099 | 2105 | 2107 | 2109 | 2115 | 2119 | 2121 | 2127 | 2132 | 2133 | 2141 | 2143 | 2144 | 2145 | 2148 | 2153 | 2154 | 2157 | 2166 | 2168 | 2174 | 2176 | 2177 | 2180 | 2187 | 2194 | 2201 | 2206 | 2207 | 2210 | 2213 | 2216 | 2218 | 2220 | ... | 16856 | 16872 | 16880 | 16885 | 16959 | 17007 | 17046 | 17048 | 17068 | 17099 | 17123 | 17159 | 17169 | 17174 | 17181 | 17328 | 17399 | 17426 | 17444 | 17465 | 17567 | 17584 | 17603 | 17639 | 17650 | 17665 | 17779 | 17856 | 17861 | 17875 | 17924 | 18041 | 18061 | 18172 | 18200 | 18213 | 18265 | 18300 | 18303 | 18430 | 18606 | 18665 | 18711 | 18722 | 18740 | 18789 | 18824 | 18844 | 18880 | 18947 | 19033 | 19038 | 19045 | 19049 | 19068 | 19081 | 19094 | 19141 | 19144 | 19161 | 19187 | 19189 | 19190 | 19197 | 19202 | 19232 | 19237 | 19246 | 19272 | 19328 | 19331 | 19392 | 19406 | 19419 | 19431 | 19436 | 19502 | 19513 | 19517 | 19537 | 19545 | 19566 | 19586 | 19613 | 19626 | 19627 | 19636 | 19658 | 19665 | 19701 | 19717 | 19740 | 19833 | 19845 | 19847 | 19859 | 19926 | 19943 | 19973 | 19999 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 18 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 19 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 1349 columns
chi_sqr, p_value, DOF, EXP = chi2_contingency(AM)
chi_sqr
57003.01607044047
p_value
0.12517635416744993
# Comparing Educationfield and Gender
EG = pd.crosstab(df_train6.EducationField,df_train6.Gender)
EG.head()
| Gender | Female | Male |
|---|---|---|
| EducationField | ||
| Human Resources | 8 | 19 |
| Life Sciences | 240 | 366 |
| Marketing | 69 | 90 |
| Medical | 190 | 274 |
| Other | 29 | 53 |
chi_sqr,p_value,DOF,EXP = chi2_contingency(EG)
chi_sqr
2.9414238793151797
p_value
0.7090162522843911
plt.figure(figsize=(10,9))
sns.countplot(x=df_train6.EducationField,hue=df_train6.Gender)
<AxesSubplot: xlabel='EducationField', ylabel='count'>
# Comparing Hourly rate and Age
ht = pd.crosstab(df_train6.HourlyRate,df_train6.Gender)
ht
| Gender | Female | Male |
|---|---|---|
| HourlyRate | ||
| 30 | 11 | 8 |
| 31 | 8 | 7 |
| 32 | 8 | 16 |
| 33 | 12 | 7 |
| 34 | 5 | 7 |
| ... | ... | ... |
| 96 | 12 | 15 |
| 97 | 7 | 14 |
| 98 | 13 | 15 |
| 99 | 7 | 13 |
| 100 | 7 | 12 |
71 rows × 2 columns
chi_sqr,p_value,DOF,EXP = chi2_contingency(ht)
chi_sqr
66.92505276398174
p_value
0.5820716429499165
# Use 1] SLR, 2] MLR, 3] Polynomial Regression, 4] Interaction term, 5] Ridge Regression, 6] Lasso etc
DF_S = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
DF_S.head()
| Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
# Splitting the datasets into training and testing datasets
x = DF_S[['DailyRate']]
y = DF_S[['MonthlyRate']]
x
| DailyRate | |
|---|---|
| 0 | 1102 |
| 1 | 279 |
| 2 | 1373 |
| 3 | 1392 |
| 4 | 591 |
| ... | ... |
| 1465 | 884 |
| 1466 | 613 |
| 1467 | 155 |
| 1468 | 1023 |
| 1469 | 628 |
1470 rows × 1 columns
y
| MonthlyRate | |
|---|---|
| 0 | 19479 |
| 1 | 24907 |
| 2 | 2396 |
| 3 | 23159 |
| 4 | 16632 |
| ... | ... |
| 1465 | 12290 |
| 1466 | 21457 |
| 1467 | 5174 |
| 1468 | 13243 |
| 1469 | 10228 |
1470 rows × 1 columns
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.20,random_state=31)
xtrain
| DailyRate | |
|---|---|
| 815 | 984 |
| 1187 | 1189 |
| 1321 | 207 |
| 728 | 1441 |
| 387 | 759 |
| ... | ... |
| 826 | 433 |
| 610 | 269 |
| 894 | 685 |
| 16 | 334 |
| 722 | 1391 |
1176 rows × 1 columns
xtest
| DailyRate | |
|---|---|
| 1343 | 592 |
| 334 | 549 |
| 1136 | 329 |
| 1080 | 228 |
| 396 | 1473 |
| ... | ... |
| 91 | 632 |
| 520 | 817 |
| 1403 | 119 |
| 479 | 1287 |
| 1135 | 563 |
294 rows × 1 columns
# Model Building
lin_model = LinearRegression()
lin_model.fit(xtrain,ytrain)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Training data evaluation
ypredtrain = lin_model.predict(xtrain)
MAE = mean_absolute_error(ytrain,ypredtrain)
print('Mean absolute error:',MAE)
MSE = mean_squared_error(ytrain,ypredtrain)
print('Mean squared error:',MSE)
RMSE = np.sqrt(MSE)
print('Root mean squared error:',RMSE)
RSquared = r2_score(ytrain,ypredtrain)
print('R-Squared:',RSquared)
AdjRsquared = 1-((1-RSquared)*(len(xtrain)-1)/(len(xtrain)-len(x.columns)-1))
print('AdjRsquared:',AdjRsquared)
Mean absolute error: 6270.772185623379 Mean squared error: 51623223.78228901 Root mean squared error: 7184.930325499964 R-Squared: 0.0010404518598107204 AdjRsquared: 0.0001895493486180344
# Testing data evaluation
ypredtest = lin_model.predict(xtest)
MAE = mean_absolute_error(ytest,ypredtest)
print('Mean absolute error:',MAE)
MSE = mean_squared_error(ytest,ypredtest)
print('Mean squared error:',MSE)
RMSE = np.sqrt(MSE)
print('Root mean squared error:',RMSE)
RSquared = r2_score(ytest,ypredtest)
print('R-Squared:',RSquared)
AdjRsquared = 1-((1-RSquared)*(len(xtest)-1)/(len(xtest)-len(x.columns)-1))
print('AdjRsquared:',AdjRsquared)
Mean absolute error: 5849.367272215098 Mean squared error: 46390905.9967264 Root mean squared error: 6811.08699083534 R-Squared: 0.0007109055817979826 AdjRsquared: -0.0027113173442918637
# Read the data
df_7 = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
df_7.head()
| Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
y = df_7[['MonthlyRate']]
df_7tr = df_7.drop('MonthlyRate',axis=1)
df_7tr
| Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 36 | Travel_Frequently | 884 | Research & Development | 23 | 2 | Medical | 1 | 2061 | 3 | Male | 41 | 4 | 2 | Laboratory Technician | 4 | Married | 2571 | 4 | Y | No | 17 | 3 | 3 | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 0 | 39 | Travel_Rarely | 613 | Research & Development | 6 | 1 | Medical | 1 | 2062 | 4 | Male | 42 | 2 | 3 | Healthcare Representative | 1 | Married | 9991 | 4 | Y | No | 15 | 3 | 1 | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 0 | 27 | Travel_Rarely | 155 | Research & Development | 4 | 3 | Life Sciences | 1 | 2064 | 2 | Male | 87 | 4 | 2 | Manufacturing Director | 2 | Married | 6142 | 1 | Y | Yes | 20 | 4 | 2 | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 0 | 49 | Travel_Frequently | 1023 | Sales | 2 | 3 | Medical | 1 | 2065 | 4 | Male | 63 | 2 | 2 | Sales Executive | 2 | Married | 5390 | 2 | Y | No | 14 | 3 | 4 | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 0 | 34 | Travel_Rarely | 628 | Research & Development | 8 | 3 | Medical | 1 | 2068 | 2 | Male | 82 | 4 | 2 | Laboratory Technician | 3 | Married | 4404 | 2 | Y | No | 12 | 3 | 1 | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 34 columns
# Seprate Categorical and continious columns
cat = []
con = []
for i in df_7tr.columns:
if (df_7tr[i].dtypes == 'object'):
cat.append(i)
else:
con.append(i)
df_train7_con = con
df_train7_con = df_7tr[con]
df_train7_con
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | 1102 | 1 | 2 | 1 | 1 | 2 | 94 | 3 | 2 | 4 | 5993 | 8 | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | 279 | 8 | 1 | 1 | 2 | 3 | 61 | 2 | 2 | 2 | 5130 | 1 | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | 1373 | 2 | 2 | 1 | 4 | 4 | 92 | 2 | 1 | 3 | 2090 | 6 | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | 1392 | 3 | 4 | 1 | 5 | 4 | 56 | 3 | 1 | 3 | 2909 | 1 | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | 591 | 2 | 1 | 1 | 7 | 1 | 40 | 3 | 1 | 2 | 3468 | 9 | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 36 | 884 | 23 | 2 | 1 | 2061 | 3 | 41 | 4 | 2 | 4 | 2571 | 4 | 17 | 3 | 3 | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 0 | 39 | 613 | 6 | 1 | 1 | 2062 | 4 | 42 | 2 | 3 | 1 | 9991 | 4 | 15 | 3 | 1 | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 0 | 27 | 155 | 4 | 3 | 1 | 2064 | 2 | 87 | 4 | 2 | 2 | 6142 | 1 | 20 | 4 | 2 | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 0 | 49 | 1023 | 2 | 3 | 1 | 2065 | 4 | 63 | 2 | 2 | 2 | 5390 | 2 | 14 | 3 | 4 | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 0 | 34 | 628 | 8 | 3 | 1 | 2068 | 2 | 82 | 4 | 2 | 3 | 4404 | 2 | 12 | 3 | 1 | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 26 columns
df_train7_cat = cat
df_train7_cat = df_7tr[cat]
df_train7_cat
| BusinessTravel | Department | EducationField | Gender | JobRole | MaritalStatus | Over18 | OverTime | |
|---|---|---|---|---|---|---|---|---|
| 0 | Travel_Rarely | Sales | Life Sciences | Female | Sales Executive | Single | Y | Yes |
| 1 | Travel_Frequently | Research & Development | Life Sciences | Male | Research Scientist | Married | Y | No |
| 2 | Travel_Rarely | Research & Development | Other | Male | Laboratory Technician | Single | Y | Yes |
| 3 | Travel_Frequently | Research & Development | Life Sciences | Female | Research Scientist | Married | Y | Yes |
| 4 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | Travel_Frequently | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
| 1466 | Travel_Rarely | Research & Development | Medical | Male | Healthcare Representative | Married | Y | No |
| 1467 | Travel_Rarely | Research & Development | Life Sciences | Male | Manufacturing Director | Married | Y | Yes |
| 1468 | Travel_Frequently | Sales | Medical | Male | Sales Executive | Married | Y | No |
| 1469 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
1470 rows × 8 columns
# Visulaisation of categorical columns
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train7_cat.columns):
if df_train7_cat[i].dtypes=='object':
plt.subplot(3,3,x1+1)
sns.countplot(x=df_train7_cat[i])
# Visualisation continiuos columns
plt.figure(figsize=(17,19))
for x1,i in enumerate(df_train7_con.columns):
if df_train7_con[i].dtypes=='int64' or df_train7_con[i].dtypes=='float64':
plt.subplot(9,3,x1+1)
sns.boxplot(df_train7_con[i])
for i in df_train7_con.columns:
q1 = df_train7_con[i].quantile(0.25)
q3 = df_train7_con[i].quantile(0.75)
IQR = q3-q1
uppertail = q3+1.5*IQR
lowertail = q1-1.5*IQR
df_train7_con.loc[(df_train7_con[i]>uppertail) | (df_train7_con[i]<lowertail)]
mean_7 = df_train7_con[i].mean()
df_train7_con.loc[(df_train7_con[i]>uppertail) | (df_train7_con[i]<lowertail),i],mean_7
plt.figure(figsize=(17,19))
for x1,i in enumerate(df_train7_con.columns):
if df_train7_con[i].dtypes=='int64' or df_train7_con[i].dtypes=='float64':
plt.subplot(9,3,x1+1)
sns.boxplot(df_train7_con[i])
# One hot encoding
df_train7_dum = pd.get_dummies(df_train7_cat)
df_train7_dum.head()
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | Over18_Y | OverTime_No | OverTime_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
| 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 2 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
| 3 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 4 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
# Standardization
std_scaler = StandardScaler()
std_scaler1 = std_scaler.fit_transform(df_train7_con)
x_lin = pd.DataFrame(std_scaler1,columns=df_train7_con.columns)
x_lin.head()
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.280906 | 0.446350 | 0.742527 | -1.010909 | -0.891688 | 0.0 | -1.701283 | -0.660531 | 1.383138 | 0.379672 | -0.057788 | 1.153254 | -0.108350 | 2.125136 | -1.150554 | -0.426230 | -1.584178 | 0.0 | -0.932014 | -0.421642 | -2.171982 | -2.493820 | -0.164613 | -0.063296 | -0.679146 | 0.245834 |
| 1 | -0.438422 | 1.322365 | -1.297775 | -0.147150 | -1.868426 | 0.0 | -1.699621 | 0.254625 | -0.240677 | -1.026167 | -0.057788 | -0.660853 | -0.291719 | -0.678049 | 2.129306 | 2.346151 | 1.191438 | 0.0 | 0.241988 | -0.164511 | 0.155707 | 0.338096 | 0.488508 | 0.764998 | -0.368715 | 0.806541 |
| 2 | 2.280906 | 0.008343 | 1.414363 | -0.887515 | -0.891688 | 0.0 | -1.696298 | 1.169781 | 1.284725 | -1.026167 | -0.961486 | 0.246200 | -0.937654 | 1.324226 | -0.057267 | -0.426230 | -0.658973 | 0.0 | -0.932014 | -0.550208 | 0.155707 | 0.338096 | -1.144294 | -1.167687 | -0.679146 | -1.155935 |
| 3 | -0.438422 | -0.429664 | 1.461466 | -0.764121 | 1.061787 | 0.0 | -1.694636 | 1.169781 | -0.486709 | 0.379672 | -0.961486 | 0.246200 | -0.763634 | -0.678049 | -1.150554 | -0.426230 | 0.266233 | 0.0 | -0.932014 | -0.421642 | 0.155707 | 0.338096 | 0.161947 | 0.764998 | 0.252146 | -1.155935 |
| 4 | -0.438422 | -1.086676 | -0.524295 | -0.887515 | -1.868426 | 0.0 | -1.691313 | -1.575686 | -1.274014 | 0.379672 | -0.961486 | -0.660853 | -0.644858 | 2.525591 | -0.877232 | -0.426230 | 1.191438 | 0.0 | 0.241988 | -0.678774 | 0.155707 | 0.338096 | -0.817734 | -0.615492 | -0.058285 | -0.595227 |
df_merge_7 = pd.concat([df_train7_dum,x_lin],axis=1)
df_merge_7
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | Over18_Y | OverTime_No | OverTime_Yes | Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 2.280906 | 0.446350 | 0.742527 | -1.010909 | -0.891688 | 0.0 | -1.701283 | -0.660531 | 1.383138 | 0.379672 | -0.057788 | 1.153254 | -0.108350 | 2.125136 | -1.150554 | -0.426230 | -1.584178 | 0.0 | -0.932014 | -0.421642 | -2.171982 | -2.493820 | -0.164613 | -0.063296 | -0.679146 | 0.245834 |
| 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | -0.438422 | 1.322365 | -1.297775 | -0.147150 | -1.868426 | 0.0 | -1.699621 | 0.254625 | -0.240677 | -1.026167 | -0.057788 | -0.660853 | -0.291719 | -0.678049 | 2.129306 | 2.346151 | 1.191438 | 0.0 | 0.241988 | -0.164511 | 0.155707 | 0.338096 | 0.488508 | 0.764998 | -0.368715 | 0.806541 |
| 2 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 2.280906 | 0.008343 | 1.414363 | -0.887515 | -0.891688 | 0.0 | -1.696298 | 1.169781 | 1.284725 | -1.026167 | -0.961486 | 0.246200 | -0.937654 | 1.324226 | -0.057267 | -0.426230 | -0.658973 | 0.0 | -0.932014 | -0.550208 | 0.155707 | 0.338096 | -1.144294 | -1.167687 | -0.679146 | -1.155935 |
| 3 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | -0.438422 | -0.429664 | 1.461466 | -0.764121 | 1.061787 | 0.0 | -1.694636 | 1.169781 | -0.486709 | 0.379672 | -0.961486 | 0.246200 | -0.763634 | -0.678049 | -1.150554 | -0.426230 | 0.266233 | 0.0 | -0.932014 | -0.421642 | 0.155707 | 0.338096 | 0.161947 | 0.764998 | 0.252146 | -1.155935 |
| 4 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | -0.438422 | -1.086676 | -0.524295 | -0.887515 | -1.868426 | 0.0 | -1.691313 | -1.575686 | -1.274014 | 0.379672 | -0.961486 | -0.660853 | -0.644858 | 2.525591 | -0.877232 | -0.426230 | 1.191438 | 0.0 | 0.241988 | -0.678774 | 0.155707 | 0.338096 | -0.817734 | -0.615492 | -0.058285 | -0.595227 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | -0.438422 | -0.101159 | 0.202082 | 1.703764 | -0.891688 | 0.0 | 1.721670 | 0.254625 | -1.224807 | 1.785511 | -0.057788 | 1.153254 | -0.835451 | 0.523316 | 0.489376 | -0.426230 | 0.266233 | 0.0 | 0.241988 | 0.735447 | 0.155707 | 0.338096 | -0.327893 | -0.615492 | -0.679146 | -0.314873 |
| 1466 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | -0.438422 | 0.227347 | -0.469754 | -0.393938 | -1.868426 | 0.0 | 1.723332 | 1.169781 | -1.175601 | -1.026167 | 0.845911 | -1.567907 | 0.741140 | 0.523316 | -0.057267 | -0.426230 | -1.584178 | 0.0 | 0.241988 | -0.293077 | 1.707500 | 0.338096 | -0.001333 | 0.764998 | -0.368715 | 0.806541 |
| 1467 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | -0.438422 | -1.086676 | -1.605183 | -0.640727 | 0.085049 | 0.0 | 1.726655 | -0.660531 | 1.038693 | 1.785511 | -0.057788 | -0.660853 | -0.076690 | -0.678049 | 1.309341 | 2.346151 | -0.658973 | 0.0 | 0.241988 | -0.678774 | -2.171982 | 0.338096 | -0.164613 | -0.615492 | -0.679146 | -0.314873 |
| 1468 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | -0.438422 | 1.322365 | 0.546677 | -0.887515 | 0.085049 | 0.0 | 1.728317 | 1.169781 | -0.142264 | -1.026167 | -0.057788 | -0.660853 | -0.236474 | -0.277594 | -0.330589 | -0.426230 | 1.191438 | 0.0 | -0.932014 | 0.735447 | 0.155707 | -1.077862 | 0.325228 | 0.488900 | -0.679146 | 1.086895 |
| 1469 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | -0.438422 | -0.320163 | -0.432568 | -0.147150 | 0.085049 | 0.0 | 1.733302 | -0.660531 | 0.792660 | 1.785511 | -0.057788 | 0.246200 | -0.445978 | -0.277594 | -0.877232 | -0.426230 | -1.584178 | 0.0 | -0.932014 | -0.678774 | 0.155707 | 1.754054 | -0.491174 | -0.339394 | -0.368715 | -0.595227 |
1470 rows × 55 columns
# VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif7l = pd.DataFrame()
vif7l['Features'] = df_merge_7.columns
vif7l['VIF'] = [variance_inflation_factor(df_merge_7.values,i) for i in range(df_merge_7.shape[1])]
vif7l
| Features | VIF | |
|---|---|---|
| 0 | BusinessTravel_Non-Travel | inf |
| 1 | BusinessTravel_Travel_Frequently | inf |
| 2 | BusinessTravel_Travel_Rarely | inf |
| 3 | Department_Human Resources | inf |
| 4 | Department_Research & Development | inf |
| 5 | Department_Sales | inf |
| 6 | EducationField_Human Resources | inf |
| 7 | EducationField_Life Sciences | inf |
| 8 | EducationField_Marketing | inf |
| 9 | EducationField_Medical | inf |
| 10 | EducationField_Other | inf |
| 11 | EducationField_Technical Degree | inf |
| 12 | Gender_Female | inf |
| 13 | Gender_Male | inf |
| 14 | JobRole_Healthcare Representative | inf |
| 15 | JobRole_Human Resources | inf |
| 16 | JobRole_Laboratory Technician | inf |
| 17 | JobRole_Manager | inf |
| 18 | JobRole_Manufacturing Director | inf |
| 19 | JobRole_Research Director | inf |
| 20 | JobRole_Research Scientist | inf |
| 21 | JobRole_Sales Executive | inf |
| 22 | JobRole_Sales Representative | inf |
| 23 | MaritalStatus_Divorced | inf |
| 24 | MaritalStatus_Married | inf |
| 25 | MaritalStatus_Single | inf |
| 26 | Over18_Y | 0.000000 |
| 27 | OverTime_No | inf |
| 28 | OverTime_Yes | inf |
| 29 | Attrition | 1.347205 |
| 30 | Age | 2.093292 |
| 31 | DailyRate | 1.038089 |
| 32 | DistanceFromHome | 1.031397 |
| 33 | Education | 1.084381 |
| 34 | EmployeeCount | NaN |
| 35 | EmployeeNumber | 1.035822 |
| 36 | EnvironmentSatisfaction | 1.048560 |
| 37 | HourlyRate | 1.027678 |
| 38 | JobInvolvement | 1.049355 |
| 39 | JobLevel | 14.152924 |
| 40 | JobSatisfaction | 1.043983 |
| 41 | MonthlyIncome | 18.134627 |
| 42 | NumCompaniesWorked | 1.300624 |
| 43 | PercentSalaryHike | 2.565100 |
| 44 | PerformanceRating | 2.550958 |
| 45 | RelationshipSatisfaction | 1.038662 |
| 46 | StandardHours | NaN |
| 47 | StockOptionLevel | 1.923492 |
| 48 | TotalWorkingYears | 5.019051 |
| 49 | TrainingTimesLastYear | 1.040440 |
| 50 | WorkLifeBalance | 1.034209 |
| 51 | YearsAtCompany | 4.760278 |
| 52 | YearsInCurrentRole | 2.805982 |
| 53 | YearsSinceLastPromotion | 1.729502 |
| 54 | YearsWithCurrManager | 2.856316 |
featuresdrop = vif7l.loc[vif7l['VIF']>10]
droplist = featurestodrop['Features']
droplist = list(droplist)
len(droplist)
print(droplist)
['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely', 'Department_Human Resources', 'Department_Research & Development', 'Department_Sales', 'EducationField_Human Resources', 'EducationField_Life Sciences', 'EducationField_Marketing', 'EducationField_Medical', 'EducationField_Other', 'EducationField_Technical Degree', 'Gender_Female', 'Gender_Male', 'JobRole_Healthcare Representative', 'JobRole_Human Resources', 'JobRole_Laboratory Technician', 'JobRole_Manager', 'JobRole_Manufacturing Director', 'JobRole_Research Director', 'JobRole_Research Scientist', 'JobRole_Sales Executive', 'JobRole_Sales Representative', 'MaritalStatus_Divorced', 'MaritalStatus_Married', 'MaritalStatus_Single', 'OverTime_No', 'OverTime_Yes']
drop_list7 = ['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
'Department_Human Resources', 'Department_Research & Development', 'Department_Sales',
'EducationField_Human Resources','EducationField_Life Sciences', 'EducationField_Marketing',
'EducationField_Medical', 'EducationField_Other','EducationField_Technical Degree', 'Gender_Female',
'Gender_Male', 'JobRole_Healthcare Representative','JobRole_Human Resources',
'JobRole_Laboratory Technician', 'JobRole_Manager', 'JobRole_Manufacturing Director',
'JobRole_Research Director', 'JobRole_Research Scientist', 'JobRole_Sales Executive',
'JobRole_Sales Representative','MaritalStatus_Divorced', 'MaritalStatus_Married','MaritalStatus_Single',
'OverTime_No', 'OverTime_Yes']
df_lin_7 = df_merge_7.drop(drop_list7,axis=1)
df_lin_7
| Over18_Y | Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2.280906 | 0.446350 | 0.742527 | -1.010909 | -0.891688 | 0.0 | -1.701283 | -0.660531 | 1.383138 | 0.379672 | -0.057788 | 1.153254 | -0.108350 | 2.125136 | -1.150554 | -0.426230 | -1.584178 | 0.0 | -0.932014 | -0.421642 | -2.171982 | -2.493820 | -0.164613 | -0.063296 | -0.679146 | 0.245834 |
| 1 | 1 | -0.438422 | 1.322365 | -1.297775 | -0.147150 | -1.868426 | 0.0 | -1.699621 | 0.254625 | -0.240677 | -1.026167 | -0.057788 | -0.660853 | -0.291719 | -0.678049 | 2.129306 | 2.346151 | 1.191438 | 0.0 | 0.241988 | -0.164511 | 0.155707 | 0.338096 | 0.488508 | 0.764998 | -0.368715 | 0.806541 |
| 2 | 1 | 2.280906 | 0.008343 | 1.414363 | -0.887515 | -0.891688 | 0.0 | -1.696298 | 1.169781 | 1.284725 | -1.026167 | -0.961486 | 0.246200 | -0.937654 | 1.324226 | -0.057267 | -0.426230 | -0.658973 | 0.0 | -0.932014 | -0.550208 | 0.155707 | 0.338096 | -1.144294 | -1.167687 | -0.679146 | -1.155935 |
| 3 | 1 | -0.438422 | -0.429664 | 1.461466 | -0.764121 | 1.061787 | 0.0 | -1.694636 | 1.169781 | -0.486709 | 0.379672 | -0.961486 | 0.246200 | -0.763634 | -0.678049 | -1.150554 | -0.426230 | 0.266233 | 0.0 | -0.932014 | -0.421642 | 0.155707 | 0.338096 | 0.161947 | 0.764998 | 0.252146 | -1.155935 |
| 4 | 1 | -0.438422 | -1.086676 | -0.524295 | -0.887515 | -1.868426 | 0.0 | -1.691313 | -1.575686 | -1.274014 | 0.379672 | -0.961486 | -0.660853 | -0.644858 | 2.525591 | -0.877232 | -0.426230 | 1.191438 | 0.0 | 0.241988 | -0.678774 | 0.155707 | 0.338096 | -0.817734 | -0.615492 | -0.058285 | -0.595227 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 1 | -0.438422 | -0.101159 | 0.202082 | 1.703764 | -0.891688 | 0.0 | 1.721670 | 0.254625 | -1.224807 | 1.785511 | -0.057788 | 1.153254 | -0.835451 | 0.523316 | 0.489376 | -0.426230 | 0.266233 | 0.0 | 0.241988 | 0.735447 | 0.155707 | 0.338096 | -0.327893 | -0.615492 | -0.679146 | -0.314873 |
| 1466 | 1 | -0.438422 | 0.227347 | -0.469754 | -0.393938 | -1.868426 | 0.0 | 1.723332 | 1.169781 | -1.175601 | -1.026167 | 0.845911 | -1.567907 | 0.741140 | 0.523316 | -0.057267 | -0.426230 | -1.584178 | 0.0 | 0.241988 | -0.293077 | 1.707500 | 0.338096 | -0.001333 | 0.764998 | -0.368715 | 0.806541 |
| 1467 | 1 | -0.438422 | -1.086676 | -1.605183 | -0.640727 | 0.085049 | 0.0 | 1.726655 | -0.660531 | 1.038693 | 1.785511 | -0.057788 | -0.660853 | -0.076690 | -0.678049 | 1.309341 | 2.346151 | -0.658973 | 0.0 | 0.241988 | -0.678774 | -2.171982 | 0.338096 | -0.164613 | -0.615492 | -0.679146 | -0.314873 |
| 1468 | 1 | -0.438422 | 1.322365 | 0.546677 | -0.887515 | 0.085049 | 0.0 | 1.728317 | 1.169781 | -0.142264 | -1.026167 | -0.057788 | -0.660853 | -0.236474 | -0.277594 | -0.330589 | -0.426230 | 1.191438 | 0.0 | -0.932014 | 0.735447 | 0.155707 | -1.077862 | 0.325228 | 0.488900 | -0.679146 | 1.086895 |
| 1469 | 1 | -0.438422 | -0.320163 | -0.432568 | -0.147150 | 0.085049 | 0.0 | 1.733302 | -0.660531 | 0.792660 | 1.785511 | -0.057788 | 0.246200 | -0.445978 | -0.277594 | -0.877232 | -0.426230 | -1.584178 | 0.0 | -0.932014 | -0.678774 | 0.155707 | 1.754054 | -0.491174 | -0.339394 | -0.368715 | -0.595227 |
1470 rows × 27 columns
# Splitting the datasets
xtrain,xtest,ytrain,ytest = train_test_split(df_lin_7,y,test_size=0.25,random_state=31)
# Model Building
lin_model = LinearRegression()
lin_model.fit(xtrain,ytrain)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Training data evaluation
ypredtrain = lin_model.predict(xtrain)
MAE = mean_absolute_error(ytrain,ypredtrain)
print('Mean absolute error:',MAE)
MSE = mean_squared_error(ytrain,ypredtrain)
print('Mean squared error:',MSE)
RMSE = np.sqrt(MSE)
print('Root mean squared error:',RMSE)
RSquared = r2_score(ytrain,ypredtrain)
print('R-Squared:',RSquared)
AdjRsquared = 1-((1-RSquared)*(len(xtrain)-1)/(len(xtrain)-len(x_lin.columns)-1))
print('AdjRsquared:',AdjRsquared)
Mean absolute error: 6207.680039002682 Mean squared error: 50822766.188194014 Root mean squared error: 7129.008780201776 R-Squared: 0.012529315004031982 AdjRsquared: -0.011353696912149713
# testing data evaluation
ypredtest = lin_model.predict(xtest)
MAE = mean_absolute_error(ytest,ypredtest)
print('Mean absolute error:',MAE)
MSE = mean_squared_error(ytest,ypredtest)
print('Mean squared error:',MSE)
RMSE = np.sqrt(MSE)
print('Root mean squared error:',RMSE)
RSquared = r2_score(ytest,ypredtest)
print('R-Squared:',RSquared)
AdjRsquared = 1-((1-RSquared)*(len(xtest)-1)/(len(xtest)-len(x_lin.columns)-1))
print('AdjRsquared:',AdjRsquared)
Mean absolute error: 6021.015273843959 Mean squared error: 48787827.28982247 Root mean squared error: 6984.828365094053 R-Squared: -0.01442841066218259 AdjRsquared: -0.09177485839595612
lasso = Lasso()
lasso.fit(xtrain,ytrain)
Lasso()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Lasso()
# Testing Data Evaluation
ypredtest = lasso.predict(xtest)
Mean_squared_error = mean_squared_error(ytest,ypredtest)
print('Mean Squared Error:',Mean_squared_error)
r2Score = r2_score(ytest,ypredtest)
print('R2-Score:',r2Score)
Mean Squared Error: 48774725.24318157 R2-Score: -0.014155984340112893
# training data evaluation
ypredtrain = lasso.predict(xtrain)
Mean_squared_error = mean_squared_error(ytrain,ypredtrain)
print('Mean Squared Error:',Mean_squared_error)
r2Score = r2_score(ytrain,ypredtrain)
print('R2-Score:',r2Score)
Mean Squared Error: 50822852.237603806 R2-Score: 0.01252764309051757
ridge = Ridge()
ridge.fit(xtrain,ytrain)
Ridge()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Ridge()
# Testing Data Evaluation
ypredtest = ridge.predict(xtest)
Mean_squared_error = mean_squared_error(ytest,ypredtest)
print('Mean Squared Error:',Mean_squared_error)
r2Score = r2_score(ytest,ypredtest)
print('R2-Score:',r2Score)
Mean Squared Error: 48784711.04356809 R2-Score: -0.01436361563213806
#Training Data Evaluation
ypredtrain = ridge.predict(xtrain)
Mean_squared_error = mean_squared_error(ytrain,ypredtrain)
print('Mean Squared Error:',Mean_squared_error)
r2Score = r2_score(ytrain,ypredtrain)
print('R2-Score:',r2Score)
Mean Squared Error: 50822774.823542684 R2-Score: 0.012529147221868486
xtrain,xtest,ytrain,ytest = train_test_split(df_lin_7,y,test_size=0.25,random_state=31)
xtrain
| Over18_Y | Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1074 | 1 | -0.438422 | -0.429664 | -0.710227 | -0.147150 | 2.038524 | 0.0 | 0.814421 | 1.169781 | 0.152975 | 0.379672 | -0.057788 | 0.246200 | -0.024420 | -0.277594 | 0.489376 | -0.42623 | -1.584178 | 0.0 | -0.932014 | 0.349751 | 2.483396 | 0.338096 | -1.144294 | -1.167687 | -0.679146 | -1.155935 |
| 1458 | 1 | -0.438422 | -0.210661 | -1.277942 | -1.010909 | 1.061787 | 0.0 | 1.706715 | 0.254625 | -0.191470 | -2.432006 | -0.961486 | 1.153254 | -0.749185 | -0.678049 | -0.877232 | -0.42623 | 1.191438 | 0.0 | 0.241988 | -0.935905 | 1.707500 | 0.338096 | -0.491174 | -0.339394 | -0.368715 | -0.875581 |
| 414 | 1 | 2.280906 | -1.415181 | 1.600296 | -1.010909 | -1.868426 | 0.0 | -0.782403 | -1.575686 | -0.191470 | 0.379672 | -0.961486 | -0.660853 | -0.701377 | -0.678049 | 0.216054 | -0.42623 | -0.658973 | 0.0 | -0.932014 | -0.678774 | 0.931603 | 0.338096 | -0.327893 | -0.339394 | -0.368715 | -0.034520 |
| 1371 | 1 | -0.438422 | 2.088878 | 1.587900 | 0.223033 | 2.038524 | 0.0 | 1.512305 | 1.169781 | 1.137106 | -1.026167 | -0.057788 | -1.567907 | -0.238599 | 0.523316 | 0.216054 | -0.42623 | 0.266233 | 0.0 | 0.241988 | -0.678774 | 0.155707 | 0.338096 | -1.144294 | -1.167687 | -0.679146 | -1.155935 |
| 1140 | 1 | -0.438422 | 0.774856 | 1.265617 | -0.270544 | 0.085049 | 0.0 | 0.968953 | -0.660531 | -1.716872 | 0.379672 | 2.653309 | 1.153254 | 2.665772 | -1.078504 | -0.330589 | -0.42623 | 1.191438 | 0.0 | 0.241988 | 1.506840 | 0.931603 | -1.077862 | 2.447870 | 0.764998 | -0.368715 | 1.647603 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 826 | 1 | -0.438422 | 0.117845 | -0.915993 | -1.010909 | 0.085049 | 0.0 | 0.211251 | 0.254625 | -1.421633 | 1.785511 | -0.961486 | 0.246200 | -0.777445 | -0.678049 | -0.603911 | -0.42623 | 1.191438 | 0.0 | 0.241988 | -0.550208 | -0.620189 | 1.754054 | -0.001333 | 0.488900 | 0.873006 | -1.155935 |
| 610 | 1 | -0.438422 | -1.086676 | -1.322566 | -0.517332 | -1.868426 | 0.0 | -0.300531 | 0.254625 | -1.175601 | -1.026167 | 0.845911 | 1.153254 | 1.339692 | -0.678049 | 0.216054 | -0.42623 | -0.658973 | 0.0 | 0.241988 | -0.293077 | 0.155707 | 0.338096 | 0.325228 | 1.041095 | -0.679146 | 1.086895 |
| 894 | 1 | -0.438422 | 1.869874 | -0.291259 | -0.764121 | 0.085049 | 0.0 | 0.374090 | 1.169781 | 0.940280 | 0.379672 | 1.749610 | 1.153254 | 2.395924 | 0.122861 | -0.330589 | -0.42623 | -1.584178 | 0.0 | -0.932014 | 3.178192 | -0.620189 | 0.338096 | 0.488508 | 1.317193 | -0.679146 | 1.367249 |
| 16 | 1 | -0.438422 | -0.539166 | -1.161424 | -0.517332 | -0.891688 | 0.0 | -1.668050 | -1.575686 | 0.694247 | 1.785511 | -0.961486 | -0.660853 | -0.680979 | -1.078504 | -0.877232 | -0.42623 | 1.191438 | 0.0 | 1.415991 | -0.550208 | 1.707500 | -1.077862 | -0.164613 | -0.615492 | -0.679146 | 0.245834 |
| 722 | 1 | -0.438422 | 0.117845 | 1.458987 | 0.099639 | -1.868426 | 0.0 | -0.031347 | 0.254625 | 0.005356 | 0.379672 | -0.961486 | 0.246200 | -0.811441 | -1.078504 | 0.489376 | -0.42623 | -0.658973 | 0.0 | 0.241988 | -1.064470 | -2.171982 | -1.077862 | -0.817734 | -0.891589 | -0.679146 | -0.595227 |
1102 rows × 27 columns
xtest
| Over18_Y | Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1343 | 1 | -0.438422 | -0.867672 | -0.521816 | -0.270544 | 0.085049 | 0.0 | 1.425900 | 1.169781 | -0.339090 | 0.379672 | -0.961486 | -1.567907 | -0.943603 | 0.122861 | -0.330589 | -0.426230 | -0.658973 | 0.0 | -0.932014 | -0.035946 | -0.620189 | 0.338096 | -0.654454 | -0.615492 | -0.368715 | -0.595227 |
| 334 | 1 | -0.438422 | 0.884358 | -0.628417 | -0.147150 | 1.061787 | 0.0 | -0.951889 | 1.169781 | 0.448214 | 0.379672 | -0.057788 | 1.153254 | -0.596200 | 2.525591 | -0.330589 | -0.426230 | -1.584178 | 0.0 | 1.415991 | 0.092620 | 0.155707 | 0.338096 | 0.488508 | 1.317193 | 2.114728 | 1.086895 |
| 1136 | 1 | 2.280906 | -0.977174 | -1.173819 | 1.827158 | 0.085049 | 0.0 | 0.962306 | 0.254625 | -0.732742 | 0.379672 | -0.961486 | -0.660853 | -0.870085 | -0.678049 | 0.489376 | -0.426230 | 0.266233 | 0.0 | 2.589994 | -1.321601 | 0.155707 | 0.338096 | -0.981014 | -0.891589 | -0.679146 | -1.155935 |
| 1080 | 1 | -0.438422 | 0.993860 | -1.424209 | -0.764121 | 0.085049 | 0.0 | 0.834361 | 0.254625 | -0.732742 | 0.379672 | 1.749610 | -0.660853 | 2.146686 | 2.125136 | -0.877232 | -0.426230 | 1.191438 | 0.0 | 0.241988 | 1.506840 | -0.620189 | 1.754054 | 0.978348 | 2.145487 | 0.873006 | -0.875581 |
| 396 | 1 | -0.438422 | 0.665354 | 1.662273 | -0.147150 | 1.061787 | 0.0 | -0.828928 | 0.254625 | 0.399008 | 0.379672 | -0.057788 | 0.246200 | -0.420906 | 0.523316 | -0.330589 | -0.426230 | 1.191438 | 0.0 | -0.932014 | -0.421642 | 0.155707 | 0.338096 | -0.327893 | -0.615492 | -0.679146 | -0.595227 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 808 | 1 | -0.438422 | -0.867672 | 0.754922 | 2.320735 | 1.061787 | 0.0 | 0.158078 | 0.254625 | 1.333932 | 0.379672 | -0.961486 | 1.153254 | -0.847563 | 0.523316 | 1.855984 | 2.346151 | -1.584178 | 0.0 | 0.241988 | -0.035946 | -1.396086 | 0.338096 | -0.001333 | 0.212802 | -0.368715 | 0.806541 |
| 1445 | 1 | -0.438422 | 0.446350 | -0.546607 | 2.320735 | 1.061787 | 0.0 | 1.676806 | -1.575686 | -0.289883 | -1.026167 | 1.749610 | -0.660853 | 1.501601 | -1.078504 | 2.129306 | 2.346151 | 0.266233 | 0.0 | 0.241988 | 1.249709 | 0.155707 | 0.338096 | 2.121310 | 0.764998 | -0.679146 | 1.647603 |
| 1298 | 1 | 2.280906 | 0.993860 | -1.342398 | 1.456975 | -0.891688 | 0.0 | 1.322879 | 1.169781 | 0.005356 | 0.379672 | -0.057788 | -0.660853 | 0.514850 | 0.523316 | 1.855984 | 2.346151 | 1.191438 | 0.0 | 0.241988 | 0.221185 | -0.620189 | 1.754054 | 0.325228 | 0.764998 | 0.252146 | 0.806541 |
| 29 | 1 | -0.438422 | 0.993860 | -0.241677 | -0.887515 | 1.061787 | 0.0 | -1.639803 | -0.660531 | 0.841867 | 0.379672 | 2.653309 | -1.567907 | 2.644099 | 0.122861 | -0.877232 | -0.426230 | 1.191438 | 0.0 | -0.932014 | 1.378275 | -0.620189 | -1.077862 | -0.817734 | -0.615492 | -0.058285 | -0.875581 |
| 867 | 1 | -0.438422 | 1.431867 | 1.533360 | -0.887515 | 0.085049 | 0.0 | 0.315933 | 1.169781 | -1.766079 | 0.379672 | 1.749610 | -1.567907 | 2.412285 | -0.277594 | 1.855984 | 2.346151 | 0.266233 | 0.0 | 0.241988 | 2.663930 | 0.155707 | 0.338096 | -0.817734 | -0.615492 | -0.058285 | -0.595227 |
368 rows × 27 columns
ytrain
| MonthlyRate | |
|---|---|
| 1074 | 22049 |
| 1458 | 8952 |
| 414 | 21972 |
| 1371 | 20328 |
| 1140 | 3549 |
| ... | ... |
| 826 | 6004 |
| 610 | 8842 |
| 894 | 23474 |
| 16 | 15053 |
| 722 | 12127 |
1102 rows × 1 columns
knn_model = KNeighborsRegressor()
knn_model.fit(xtrain,ytrain)
KNeighborsRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsRegressor()
# Training data evaluation
ypredtrain = knn_model.predict(xtrain)
MAE = mean_absolute_error(ytrain,ypredtrain)
print('MAE:', MAE)
MSE = mean_squared_error(ytrain,ypredtrain)
print('MSE:',MSE)
RMSE = np.sqrt(MSE)
print('RMSE:',RMSE)
RS = r2_score(ytrain,ypredtrain)
print('RS:',RS)
ADJS = 1-((1*RS)*(len(xtrain)-1)/(len(xtrain)-len(x_lin.columns)-1))
print('ADJS:',ADJS)
MAE: 5516.821052631579 MSE: 42177305.469255894 RMSE: 6494.405705625103 RS: 0.18050795250327756 ADJS: 0.8151262737617595
# testing data evaluation
ypredtest = knn_model.predict(xtest)
MAE = mean_absolute_error(ytest,ypredtest)
print('MAE:', MAE)
MSE = mean_squared_error(ytest,ypredtest)
print('MSE:',MSE)
RMSE = np.sqrt(MSE)
print('RMSE:',RMSE)
RS = r2_score(ytest,ypredtest)
print('RS:',RS)
ADJS = 1-((1*RS)*(len(xtest)-1)/(len(xtest)-len(x_lin.columns)-1))
print('ADJS:',ADJS)
MAE: 6257.771739130435 MSE: 56677783.95021739 RMSE: 7528.464913262025 RS: -0.17848154932014415 ADJS: 1.1920901131979265
min_max_scaler = MinMaxScaler()
array = min_max_scaler.fit_transform(x_lin)
df_norm = pd.DataFrame(array,columns=x_lin.columns)
df_norm
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.547619 | 0.715820 | 0.000000 | 0.25 | 0.0 | 0.000000 | 0.333333 | 0.914286 | 0.666667 | 0.25 | 1.000000 | 0.262454 | 0.888889 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.200 | 0.000000 | 0.000000 | 0.150 | 0.222222 | 0.000000 | 0.294118 |
| 1 | 0.0 | 0.738095 | 0.126700 | 0.250000 | 0.00 | 0.0 | 0.000484 | 0.666667 | 0.442857 | 0.333333 | 0.25 | 0.333333 | 0.217009 | 0.111111 | 0.857143 | 1.0 | 1.000000 | 0.0 | 0.333333 | 0.250 | 0.500000 | 0.666667 | 0.250 | 0.388889 | 0.066667 | 0.411765 |
| 2 | 1.0 | 0.452381 | 0.909807 | 0.035714 | 0.25 | 0.0 | 0.001451 | 1.000000 | 0.885714 | 0.333333 | 0.00 | 0.666667 | 0.056925 | 0.666667 | 0.285714 | 0.0 | 0.333333 | 0.0 | 0.000000 | 0.175 | 0.500000 | 0.666667 | 0.000 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 0.0 | 0.357143 | 0.923407 | 0.071429 | 0.75 | 0.0 | 0.001935 | 1.000000 | 0.371429 | 0.666667 | 0.00 | 0.666667 | 0.100053 | 0.111111 | 0.000000 | 0.0 | 0.666667 | 0.0 | 0.000000 | 0.200 | 0.500000 | 0.666667 | 0.200 | 0.388889 | 0.200000 | 0.000000 |
| 4 | 0.0 | 0.214286 | 0.350036 | 0.035714 | 0.00 | 0.0 | 0.002903 | 0.000000 | 0.142857 | 0.666667 | 0.00 | 0.333333 | 0.129489 | 1.000000 | 0.071429 | 0.0 | 1.000000 | 0.0 | 0.333333 | 0.150 | 0.500000 | 0.666667 | 0.050 | 0.111111 | 0.133333 | 0.117647 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0.0 | 0.428571 | 0.559771 | 0.785714 | 0.25 | 0.0 | 0.996613 | 0.666667 | 0.157143 | 1.000000 | 0.25 | 1.000000 | 0.082254 | 0.444444 | 0.428571 | 0.0 | 0.666667 | 0.0 | 0.333333 | 0.425 | 0.500000 | 0.666667 | 0.125 | 0.111111 | 0.000000 | 0.176471 |
| 1466 | 0.0 | 0.500000 | 0.365784 | 0.178571 | 0.00 | 0.0 | 0.997097 | 1.000000 | 0.171429 | 0.333333 | 0.50 | 0.000000 | 0.472986 | 0.444444 | 0.285714 | 0.0 | 0.000000 | 0.0 | 0.333333 | 0.225 | 0.833333 | 0.666667 | 0.175 | 0.388889 | 0.066667 | 0.411765 |
| 1467 | 0.0 | 0.214286 | 0.037938 | 0.107143 | 0.50 | 0.0 | 0.998065 | 0.333333 | 0.814286 | 1.000000 | 0.25 | 0.333333 | 0.270300 | 0.111111 | 0.642857 | 1.0 | 0.333333 | 0.0 | 0.333333 | 0.150 | 0.000000 | 0.666667 | 0.150 | 0.111111 | 0.000000 | 0.176471 |
| 1468 | 0.0 | 0.738095 | 0.659270 | 0.035714 | 0.50 | 0.0 | 0.998549 | 1.000000 | 0.471429 | 0.333333 | 0.25 | 0.333333 | 0.230700 | 0.222222 | 0.214286 | 0.0 | 1.000000 | 0.0 | 0.000000 | 0.425 | 0.500000 | 0.333333 | 0.225 | 0.333333 | 0.000000 | 0.470588 |
| 1469 | 0.0 | 0.380952 | 0.376521 | 0.250000 | 0.50 | 0.0 | 1.000000 | 0.333333 | 0.742857 | 1.000000 | 0.25 | 0.666667 | 0.178778 | 0.222222 | 0.071429 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.150 | 0.500000 | 1.000000 | 0.100 | 0.166667 | 0.066667 | 0.117647 |
1470 rows × 26 columns
xtrain,xtest,ytrain,ytest = train_test_split(df_norm,y,test_size=0.25,random_state=25)
knn_model = KNeighborsRegressor()
hyp_grid = {'n_neighbors':np.arange(1,40),'p':[1,2]}
gscv_knn_model = GridSearchCV(knn_model,hyp_grid,cv=5)
gscv_knn_model.fit(xtrain,ytrain)
gscv_knn_model.best_estimator_
KNeighborsRegressor(n_neighbors=37)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsRegressor(n_neighbors=37)
knn_model = KNeighborsRegressor(n_neighbors=37, p=1)
knn_model.fit(xtrain,ytrain)
KNeighborsRegressor(n_neighbors=37, p=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsRegressor(n_neighbors=37, p=1)
# Testing data evaluation
ypredtest = knn_model.predict(xtest)
MAE = mean_absolute_error(ytest,ypredtest)
print('MAE:',MAE)
MSE = mean_squared_error(ytest,ypredtest)
print('MSE:',MSE)
RMSE = np.sqrt(MSE)
print('RMSE',RMSE)
RS = r2_score(ytest,ypredtest)
print('RS:',RS)
ADJS = 1-((1-RS)*(len(xtest)-1)/(len(xtest)-len(x_lin.columns)-1))
print('ADJS:',ADJS)
MAE: 6245.193375440658 MSE: 51160933.41318441 RMSE 7152.687146323709 RS: -0.022467704993947013 ADJS: -0.10042711945096339
xtrain,xtest,ytraon,ytest = train_test_split(df_lin_7,y,test_size=0.25,random_state=31)
# Model building
dt_reg = DecisionTreeRegressor()
dt_reg.fit(xtrain,ytrain)
DecisionTreeRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeRegressor()
# training data evaluation
ypredtrain = dt_reg.predict(xtrain)
mse = mean_squared_error(ytrain, ypredtrain)
print("MSE :",mse)
rmse = np.sqrt(mse)
print("RMSE :",rmse)
r2 = r2_score(ytrain, ypredtrain)
print("R2 :",r2)
MSE : 0.0 RMSE : 0.0 R2 : 1.0
# testing data evaluation
ypred = dt_reg.predict(xtest)
mse = mean_squared_error(ytest, ypred)
print("MSE :",mse)
rmse = np.sqrt(mse)
print("RMSE :",rmse)
r2 = r2_score(ytest, ypred)
print("R2 :",r2)
MSE : 96889957.37228261 RMSE : 9843.269648459429 R2 : -1.014599356565217
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,recall_score,f1_score,precision_score
import tensorflow as tf
import keras
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
df_log = pd.read_csv('G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
df_log.head()
| Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
import pandas as pd
from google.colab import files
uploaded = files.upload()
import io
df_log = pd.read_csv(io.BytesIO(uploaded['HR-Employee-Attrition-Table 1.csv']))
df_log.head()
Saving HR-Employee-Attrition-Table 1.csv to HR-Employee-Attrition-Table 1 (2).csv
| Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
y_log = df_log['Attrition']
# Seprate Categorical and continious columns
cat = []
con = []
for i in df_log.columns:
if (df_log[i].dtypes == 'object'):
cat.append(i)
else:
con.append(i)
df_train_cat = cat
df_train_cat = df_log[cat]
df_train_cat
| BusinessTravel | Department | EducationField | Gender | JobRole | MaritalStatus | Over18 | OverTime | |
|---|---|---|---|---|---|---|---|---|
| 0 | Travel_Rarely | Sales | Life Sciences | Female | Sales Executive | Single | Y | Yes |
| 1 | Travel_Frequently | Research & Development | Life Sciences | Male | Research Scientist | Married | Y | No |
| 2 | Travel_Rarely | Research & Development | Other | Male | Laboratory Technician | Single | Y | Yes |
| 3 | Travel_Frequently | Research & Development | Life Sciences | Female | Research Scientist | Married | Y | Yes |
| 4 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | Travel_Frequently | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
| 1466 | Travel_Rarely | Research & Development | Medical | Male | Healthcare Representative | Married | Y | No |
| 1467 | Travel_Rarely | Research & Development | Life Sciences | Male | Manufacturing Director | Married | Y | Yes |
| 1468 | Travel_Frequently | Sales | Medical | Male | Sales Executive | Married | Y | No |
| 1469 | Travel_Rarely | Research & Development | Medical | Male | Laboratory Technician | Married | Y | No |
1470 rows × 8 columns
df_train_con = con
df_train_con=df_log[con]
df_train_con
| Attrition | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | 1102 | 1 | 2 | 1 | 1 | 2 | 94 | 3 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 0 | 49 | 279 | 8 | 1 | 1 | 2 | 3 | 61 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 1 | 37 | 1373 | 2 | 2 | 1 | 4 | 4 | 92 | 2 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 0 | 33 | 1392 | 3 | 4 | 1 | 5 | 4 | 56 | 3 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 0 | 27 | 591 | 2 | 1 | 1 | 7 | 1 | 40 | 3 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 36 | 884 | 23 | 2 | 1 | 2061 | 3 | 41 | 4 | ... | 3 | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 0 | 39 | 613 | 6 | 1 | 1 | 2062 | 4 | 42 | 2 | ... | 1 | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 0 | 27 | 155 | 4 | 3 | 1 | 2064 | 2 | 87 | 4 | ... | 2 | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 0 | 49 | 1023 | 2 | 3 | 1 | 2065 | 4 | 63 | 2 | ... | 4 | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 0 | 34 | 628 | 8 | 3 | 1 | 2068 | 2 | 82 | 4 | ... | 1 | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 27 columns
# Removing Outliers
# Visualisation of continious columns
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train_con.columns):
if df_train_con[i].dtypes == 'int64' or df_train_con[i].dtypes == 'float64':
plt.subplot(9,3,x1+1)
sns.boxplot(df_train_con[i])
for i in df_train_con.columns:
q1 = df_train_con[i].quantile(0.25)
q3 = df_train_con[i].quantile(0.75)
IQR = q3-q1
uppertail = q3+1.5*IQR
lowertail = q1-1.5*IQR
df_train_con.loc[(df_train_con[i]>uppertail)|(df_train_con[i]<lowertail)]
mean_1= df_train_con[i].mean()
df_train_con.loc[(df_train_con[i]>uppertail)| (df_train_con[i]<lowertail),i]=mean_1
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train_con.columns):
if df_train_con[i].dtypes=='int64' or df_train_con[i].dtypes=='float64':
plt.subplot(9,3,x1+1)
sns.boxplot(df_train_con[i])
# One hot Encoding
df_log_dum = pd.get_dummies(df_train_cat)
df_log_dum
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | ... | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | Over18_Y | OverTime_No | OverTime_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
| 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 2 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
| 3 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 4 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 1466 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 1467 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 1468 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 1469 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
1470 rows × 29 columns
# Standardisation
std_scaler = StandardScaler()
std_scaler1 = std_scaler.fit_transform(df_train_con)
x_log = pd.DataFrame(std_scaler1,columns=df_train_con.columns)
x_log.shape
(1470, 27)
df_final_log = pd.concat([df_log_dum,x_log],axis=1)
df_final_log.columns
Index(['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently',
'BusinessTravel_Travel_Rarely', 'Department_Human Resources',
'Department_Research & Development', 'Department_Sales',
'EducationField_Human Resources', 'EducationField_Life Sciences',
'EducationField_Marketing', 'EducationField_Medical',
'EducationField_Other', 'EducationField_Technical Degree',
'Gender_Female', 'Gender_Male', 'JobRole_Healthcare Representative',
'JobRole_Human Resources', 'JobRole_Laboratory Technician',
'JobRole_Manager', 'JobRole_Manufacturing Director',
'JobRole_Research Director', 'JobRole_Research Scientist',
'JobRole_Sales Executive', 'JobRole_Sales Representative',
'MaritalStatus_Divorced', 'MaritalStatus_Married',
'MaritalStatus_Single', 'Over18_Y', 'OverTime_No', 'OverTime_Yes',
'Attrition', 'Age', 'DailyRate', 'DistanceFromHome', 'Education',
'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction',
'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction',
'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
'StandardHours', 'StockOptionLevel', 'TotalWorkingYears',
'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
'YearsInCurrentRole', 'YearsSinceLastPromotion',
'YearsWithCurrManager'],
dtype='object')
# Checking Linearity
noncorrelation = [column for column in df_final_log if abs(df_final_log[column].corr(df_final_log['Attrition']))<0.05]
corr_matrix = df_final_log.drop(noncorrelation,axis=1)
corr_matrix.head()
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | Department_Research & Development | Department_Sales | EducationField_Marketing | EducationField_Technical Degree | JobRole_Healthcare Representative | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | ... | JobLevel | JobSatisfaction | MonthlyIncome | StandardHours | StockOptionLevel | TotalWorkingYears | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | 1.153254 | 0.129018 | 0.0 | -1.018674 | -0.374906 | -2.493820 | 0.041137 | -0.018341 | 0.294570 |
| 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | -0.660853 | -0.140791 | 0.0 | 0.510149 | -0.057867 | 0.338096 | 1.069787 | 0.882230 | 0.888852 |
| 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | 0.246200 | -1.091220 | 0.0 | -1.018674 | -0.533426 | 0.338096 | -1.501837 | -1.219103 | -1.191138 |
| 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | 0.246200 | -0.835167 | 0.0 | -1.018674 | -0.374906 | 0.338096 | 0.555462 | 0.882230 | -1.191138 |
| 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | -0.660853 | -0.660400 | 0.0 | 0.510149 | -0.691946 | 0.338096 | -0.987512 | -0.618722 | -0.596855 |
5 rows × 35 columns
# Check Multicolinearity
log7 = corr_matrix.drop('Attrition',axis=1)
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_log = pd.DataFrame()
vif_log['Features'] =log7.columns
vif_log['VIF'] = [variance_inflation_factor(log7.values,i) for i in range(log7.shape[1])]
vif_log
| Features | VIF | |
|---|---|---|
| 0 | BusinessTravel_Non-Travel | 1.046263 |
| 1 | BusinessTravel_Travel_Frequently | 1.042999 |
| 2 | Department_Research & Development | 6.451867 |
| 3 | Department_Sales | 6.756041 |
| 4 | EducationField_Marketing | 1.433720 |
| 5 | EducationField_Technical Degree | 1.030463 |
| 6 | JobRole_Healthcare Representative | 1.765789 |
| 7 | JobRole_Laboratory Technician | 1.517876 |
| 8 | JobRole_Manager | 2.566583 |
| 9 | JobRole_Manufacturing Director | 1.823432 |
| 10 | JobRole_Research Director | 2.516362 |
| 11 | JobRole_Sales Representative | 1.507687 |
| 12 | MaritalStatus_Divorced | inf |
| 13 | MaritalStatus_Married | inf |
| 14 | MaritalStatus_Single | inf |
| 15 | Over18_Y | 0.000000 |
| 16 | OverTime_No | inf |
| 17 | OverTime_Yes | inf |
| 18 | Age | 1.607862 |
| 19 | DailyRate | 1.025251 |
| 20 | DistanceFromHome | 1.019110 |
| 21 | EmployeeCount | NaN |
| 22 | EnvironmentSatisfaction | 1.028213 |
| 23 | JobInvolvement | 1.017833 |
| 24 | JobLevel | 6.005247 |
| 25 | JobSatisfaction | 1.015843 |
| 26 | MonthlyIncome | 2.618679 |
| 27 | StandardHours | NaN |
| 28 | StockOptionLevel | 2.128814 |
| 29 | TotalWorkingYears | 2.575361 |
| 30 | WorkLifeBalance | 1.021160 |
| 31 | YearsAtCompany | 4.451725 |
| 32 | YearsInCurrentRole | 3.287478 |
| 33 | YearsWithCurrManager | 2.866381 |
featurestodrop = vif_log.loc[vif_log['VIF']>10]
droplist = featurestodrop['Features']
droplist = list(droplist)
len(droplist)
print(droplist)
['MaritalStatus_Divorced', 'MaritalStatus_Married', 'MaritalStatus_Single', 'OverTime_No', 'OverTime_Yes']
final_log=log7.drop(droplist,axis=1)
final_log
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | Department_Research & Development | Department_Sales | EducationField_Marketing | EducationField_Technical Degree | JobRole_Healthcare Representative | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | ... | JobLevel | JobSatisfaction | MonthlyIncome | StandardHours | StockOptionLevel | TotalWorkingYears | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | 1.153254 | 0.129018 | 0.0 | -1.018674 | -0.374906 | -2.493820 | 0.041137 | -0.018341 | 0.294570 |
| 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | -0.660853 | -0.140791 | 0.0 | 0.510149 | -0.057867 | 0.338096 | 1.069787 | 0.882230 | 0.888852 |
| 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | 0.246200 | -1.091220 | 0.0 | -1.018674 | -0.533426 | 0.338096 | -1.501837 | -1.219103 | -1.191138 |
| 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | 0.246200 | -0.835167 | 0.0 | -1.018674 | -0.374906 | 0.338096 | 0.555462 | 0.882230 | -1.191138 |
| 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | -0.660853 | -0.660400 | 0.0 | 0.510149 | -0.691946 | 0.338096 | -0.987512 | -0.618722 | -0.596855 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.057788 | 1.153254 | -0.940839 | 0.0 | 0.510149 | 1.051772 | 0.338096 | -0.216025 | -0.618722 | -0.299713 |
| 1466 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0.845911 | -1.567907 | 1.378958 | 0.0 | 0.510149 | -0.216386 | 0.338096 | 0.298300 | 0.882230 | 0.888852 |
| 1467 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | -0.057788 | -0.660853 | 0.175602 | 0.0 | 0.510149 | -0.691946 | 0.338096 | 0.041137 | -0.618722 | -0.299713 |
| 1468 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | -0.660853 | -0.059504 | 0.0 | -1.018674 | 1.051772 | -1.077862 | 0.812625 | 0.582040 | 1.185994 |
| 1469 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.057788 | 0.246200 | -0.367768 | 0.0 | -1.018674 | -0.691946 | 1.754054 | -0.473188 | -0.318532 | -0.596855 |
1470 rows × 29 columns
# Spliiting
xtrain,xtest,ytrain,ytest = train_test_split(final_log,y_log,test_size=0.25,random_state=31,stratify=y_log)
xtrain
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | Department_Research & Development | Department_Sales | EducationField_Marketing | EducationField_Technical Degree | JobRole_Healthcare Representative | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | ... | JobLevel | JobSatisfaction | MonthlyIncome | StandardHours | StockOptionLevel | TotalWorkingYears | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1069 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | 0.246200 | -1.255982 | 0.0 | 0.510149 | -1.484545 | -2.493820 | -1.244675 | -1.219103 | -1.191138 |
| 841 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | -0.660853 | -0.620070 | 0.0 | -1.018674 | -0.691946 | 0.338096 | -0.473188 | -0.318532 | -0.596855 |
| 332 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | 0.246200 | -0.222390 | 0.0 | -1.018674 | 1.527331 | -1.077862 | -0.473188 | -0.318532 | -0.299713 |
| 1337 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | -0.660853 | -0.851737 | 0.0 | 0.510149 | -1.484545 | 0.338096 | -1.244675 | -1.219103 | -1.191138 |
| 292 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | -0.660853 | -0.872684 | 0.0 | 0.510149 | -1.326025 | -1.077862 | -0.987512 | -0.618722 | -0.596855 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 48 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | 1.153254 | 0.059925 | 0.0 | -1.018674 | 0.576212 | 0.338096 | 0.812625 | 0.582040 | 1.185994 |
| 220 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.057788 | -0.660853 | 0.104320 | 0.0 | -1.018674 | 0.893252 | 1.754054 | 1.841275 | 2.082992 | 0.888852 |
| 551 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | -0.660853 | 0.252824 | 0.0 | 0.510149 | 0.259173 | -2.493820 | 0.555462 | -0.318532 | 0.591711 |
| 1435 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | 1.153254 | -0.983046 | 0.0 | -1.018674 | -0.691946 | 0.338096 | -0.473188 | -0.318532 | -0.596855 |
| 1379 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | -0.660853 | -0.849548 | 0.0 | -1.018674 | -1.484545 | 0.338096 | -1.244675 | -1.219103 | -1.191138 |
1102 rows × 29 columns
xtest
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | Department_Research & Development | Department_Sales | EducationField_Marketing | EducationField_Technical Degree | JobRole_Healthcare Representative | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | ... | JobLevel | JobSatisfaction | MonthlyIncome | StandardHours | StockOptionLevel | TotalWorkingYears | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 302 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | -0.057788 | -1.567907 | 0.025222 | 0.0 | -1.018674 | -0.216386 | 0.338096 | 0.555462 | -0.318532 | 0.888852 |
| 200 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | ... | -0.057788 | -1.567907 | -0.400908 | 0.0 | 0.510149 | -0.691946 | 0.338096 | -0.987512 | -0.618722 | -1.191138 |
| 944 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.057788 | 1.153254 | 0.341927 | 0.0 | 0.195024 | -0.057867 | 0.338096 | 0.812625 | 1.182421 | 0.294570 |
| 929 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | 1.153254 | -0.535657 | 0.0 | 0.510149 | -1.326025 | 0.338096 | -0.987512 | -0.618722 | -0.596855 |
| 1047 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | -1.567907 | -0.377460 | 0.0 | 2.038972 | -0.850466 | 0.338096 | -0.473188 | -0.318532 | -0.299713 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1310 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1.749610 | 0.246200 | 3.191026 | 0.0 | -1.018674 | 2.002890 | 0.338096 | -0.987512 | -0.618722 | -0.596855 |
| 766 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2.653309 | 0.246200 | 0.288444 | 0.0 | 0.510149 | 0.144974 | -1.077862 | 0.555462 | -0.918912 | 0.888852 |
| 1233 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | 1.153254 | -0.849861 | 0.0 | 0.510149 | -0.057867 | -1.077862 | 1.069787 | -1.219103 | 1.185994 |
| 504 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | -1.567907 | -0.404660 | 0.0 | 2.038972 | -0.850466 | 0.338096 | -1.244675 | -0.918912 | -1.191138 |
| 362 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | 1.153254 | -0.928646 | 0.0 | -1.018674 | -1.167505 | -1.077862 | -0.730350 | -0.618722 | -0.596855 |
368 rows × 29 columns
ytrain
1069 0
841 0
332 0
1337 0
292 0
..
48 0
220 0
551 0
1435 0
1379 1
Name: Attrition, Length: 1102, dtype: int64
ytest
302 0
200 0
944 0
929 0
1047 0
..
1310 0
766 0
1233 0
504 1
362 0
Name: Attrition, Length: 368, dtype: int64
# Model Evalution
log_model = LogisticRegression()
log_model.fit(xtrain,ytrain)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
# Testing data evaluation
ypredtest = log_model.predict(xtest)
Accuracy = accuracy_score(ytest,ypredtest)
print('Accuracy:',Accuracy)
confusionmatrix = confusion_matrix(ytest,ypredtest)
print('Confusion matrix: \n',confusionmatrix)
Precision = precision_score(ytest,ypredtest)
print('Precision:',Precision)
Recall = recall_score(ytest,ypredtest)
print('Recall:',Recall)
F1score = f1_score(ytest,ypredtest)
print('F1score:',F1score)
Classification_report = classification_report(ytest,ypredtest)
print('Classification report:\n',Classification_report)
Accuracy: 0.8586956521739131
Confusion matrix:
[[303 6]
[ 46 13]]
Precision: 0.6842105263157895
Recall: 0.22033898305084745
F1score: 0.3333333333333333
Classification report:
precision recall f1-score support
0 0.87 0.98 0.92 309
1 0.68 0.22 0.33 59
accuracy 0.86 368
macro avg 0.78 0.60 0.63 368
weighted avg 0.84 0.86 0.83 368
# Training data evaluation
ypredtrain = log_model.predict(xtrain)
Accuracy = accuracy_score(ytrain,ypredtrain)
print('Accuracy:',Accuracy)
confusionmatrix = confusion_matrix(ytrain,ypredtrain)
print('Confusion matrix: \n',confusionmatrix)
Precision = precision_score(ytrain,ypredtrain)
print('Precision:',Precision)
Recall = recall_score(ytrain,ypredtrain)
print('Recall:',Recall)
F1score = f1_score(ytrain,ypredtrain)
print('F1score:',F1score)
Classification_report = classification_report(ytrain,ypredtrain)
print('Classification report:\n',Classification_report)
Accuracy: 0.8693284936479129
Confusion matrix:
[[912 12]
[132 46]]
Precision: 0.7931034482758621
Recall: 0.25842696629213485
F1score: 0.38983050847457634
Classification report:
precision recall f1-score support
0 0.87 0.99 0.93 924
1 0.79 0.26 0.39 178
accuracy 0.87 1102
macro avg 0.83 0.62 0.66 1102
weighted avg 0.86 0.87 0.84 1102
Knn_model = KNeighborsClassifier()
Knn_model.fit(xtrain,ytrain)
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier()
# Testing data evaluation
ypredtest = Knn_model.predict(xtest)
Accuracy = accuracy_score(ytest,ypredtest)
print('Accuracy:',Accuracy)
Confusion_matrix = confusion_matrix(ytest,ypredtest)
print('Confusion_matrix:\n',Confusion_matrix)
Classification_report = classification_report(ytest,ypredtest)
print('Classification_report:\n',Classification_report)
Accuracy: 0.8396739130434783
Confusion_matrix:
[[300 9]
[ 50 9]]
Classification_report:
precision recall f1-score support
0 0.86 0.97 0.91 309
1 0.50 0.15 0.23 59
accuracy 0.84 368
macro avg 0.68 0.56 0.57 368
weighted avg 0.80 0.84 0.80 368
# Training data evaluation
ypredtrain = Knn_model.predict(xtrain)
Accuracy = accuracy_score(ytrain,ypredtrain)
print('Accuracy:',Accuracy)
Confusion_matrix = confusion_matrix(ytrain,ypredtrain)
print('Confusion_matrix:\n',Confusion_matrix)
Classification_report = classification_report(ytrain,ypredtrain)
print('Classification_report:\n',Classification_report)
Accuracy: 0.8702359346642469
Confusion_matrix:
[[911 13]
[130 48]]
Classification_report:
precision recall f1-score support
0 0.88 0.99 0.93 924
1 0.79 0.27 0.40 178
accuracy 0.87 1102
macro avg 0.83 0.63 0.66 1102
weighted avg 0.86 0.87 0.84 1102
from sklearn.naive_bayes import BernoulliNB
naive_model = BernoulliNB(binarize=True)
naive_model.fit(xtrain,ytrain)
BernoulliNB(binarize=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BernoulliNB(binarize=True)
# Training data evaluation
ypredtrain = naive_model.predict(xtrain)
Accuracy = accuracy_score(ytrain,ypredtrain)
print('Accuracy:',Accuracy)
confusionmatrix = confusion_matrix(ytrain,ypredtrain)
print('Confusion matrix: \n',confusionmatrix)
Precision = precision_score(ytrain,ypredtrain)
print('Precision:',Precision)
Recall = recall_score(ytrain,ypredtrain)
print('Recall:',Recall)
F1score = f1_score(ytrain,ypredtrain)
print('F1score:',F1score)
Classification_report = classification_report(ytrain,ypredtrain)
print('Classification report:\n',Classification_report)
Accuracy: 0.838475499092559
Confusion matrix:
[[924 0]
[178 0]]
Precision: 0.0
Recall: 0.0
F1score: 0.0
Classification report:
precision recall f1-score support
0 0.84 1.00 0.91 924
1 0.00 0.00 0.00 178
accuracy 0.84 1102
macro avg 0.42 0.50 0.46 1102
weighted avg 0.70 0.84 0.76 1102
# Testing data evaluation
ypredtest = naive_model.predict(xtest)
Accuracy = accuracy_score(ytest,ypredtest)
print('Accuracy:',Accuracy)
confusionmatrix = confusion_matrix(ytest,ypredtest)
print('Confusion matrix: \n',confusionmatrix)
Precision = precision_score(ytest,ypredtest)
print('Precision:',Precision)
Recall = recall_score(ytest,ypredtest)
print('Recall:',Recall)
F1score = f1_score(ytest,ypredtest)
print('F1score:',F1score)
Classification_report = classification_report(ytest,ypredtest)
print('Classification report:\n',Classification_report)
Accuracy: 0.8396739130434783
Confusion matrix:
[[309 0]
[ 59 0]]
Precision: 0.0
Recall: 0.0
F1score: 0.0
Classification report:
precision recall f1-score support
0 0.84 1.00 0.91 309
1 0.00 0.00 0.00 59
accuracy 0.84 368
macro avg 0.42 0.50 0.46 368
weighted avg 0.71 0.84 0.77 368
# Model Training
dt_reg = DecisionTreeRegressor()
dt_reg.fit(xtrain, ytrain)
DecisionTreeRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeRegressor()
# Training data evaluation
ypredtrain = dt_reg.predict(xtrain)
mse = mean_squared_error(ytrain, ypredtrain)
print("MSE :",mse)
rmse = np.sqrt(mse)
print("RMSE :",rmse)
r2 = r2_score(ytrain, ypredtrain)
print("R2 :",r2)
MSE : 0.0 RMSE : 0.0 R2 : 1.0
# Testing data Evaluation
ypred = dt_reg.predict(xtest)
mse = mean_squared_error(ytest, ypred)
print("MSE :",mse)
rmse = np.sqrt(mse)
print("RMSE :",rmse)
r2 = r2_score(ytest, ypred)
print("R2 :",r2)
MSE : 0.28532608695652173 RMSE : 0.5341592337089398 R2 : -1.119466842191871
# Model Training
svc_model = SVC()
svc_model.fit(xtrain,ytrain)
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC()
# Testing data evaluation
ypredtest = svc_model.predict(xtest)
Accuracy = accuracy_score(ytest,ypredtest)
print('Accuracy:',Accuracy)
Confusion_matrix = confusion_matrix(ytest,ypredtest)
print('Confusion_matrix: \n',Confusion_matrix)
Classification_report = classification_report(ytest,ypredtest)
print('Classification_report: \n',Classification_report)
Accuracy: 0.8478260869565217
Confusion_matrix:
[[307 2]
[ 54 5]]
Classification_report:
precision recall f1-score support
0 0.85 0.99 0.92 309
1 0.71 0.08 0.15 59
accuracy 0.85 368
macro avg 0.78 0.54 0.53 368
weighted avg 0.83 0.85 0.79 368
# Training data evaluation
ypredtrain = svc_model.predict(xtrain)
Accuracy = accuracy_score(ytrain,ypredtrain)
print('Accuracy:',Accuracy)
Confusion_matrix = confusion_matrix(ytrain,ypredtrain)
print('Confusion_matrix: \n',Confusion_matrix)
Classification_report = classification_report(ytrain,ypredtrain)
print('Classification_report: \n',Classification_report)
Accuracy: 0.8720508166969148
Confusion_matrix:
[[921 3]
[138 40]]
Classification_report:
precision recall f1-score support
0 0.87 1.00 0.93 924
1 0.93 0.22 0.36 178
accuracy 0.87 1102
macro avg 0.90 0.61 0.65 1102
weighted avg 0.88 0.87 0.84 1102
# Hyperparameter tuning
svc_model = SVC()
hyp_grid = {'C':np.arange(0,50),
'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']}
gscv_scv_model = GridSearchCV(svc_model,hyp_grid,cv=5)
gscv_scv_model.fit(xtrain,ytrain)
GridSearchCV(cv=5, estimator=SVC(),
param_grid={'C': array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=SVC(),
param_grid={'C': array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})SVC()
SVC()
gscv_scv_model.best_estimator_
SVC(C=1, kernel='poly')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(C=1, kernel='poly')
svc_model = SVC(C=1, kernel='linear')
svc_model.fit(xtrain,ytrain)
SVC(C=1, kernel='linear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(C=1, kernel='linear')
# Testing data evaluation
ypredtest = svc_model.predict(xtest)
Accuracy = accuracy_score(ytest,ypredtest)
print('Accuracy:',Accuracy)
Confusion_matrix = confusion_matrix(ytest,ypredtest)
print('Confusion_matrix: \n',Confusion_matrix)
Classification_report = classification_report(ytest,ypredtest)
print('Classification_report: \n',Classification_report)
Accuracy: 0.8396739130434783
Confusion_matrix:
[[309 0]
[ 59 0]]
Classification_report:
precision recall f1-score support
0 0.84 1.00 0.91 309
1 0.00 0.00 0.00 59
accuracy 0.84 368
macro avg 0.42 0.50 0.46 368
weighted avg 0.71 0.84 0.77 368
# Training data evaluation
ypredtrain = svc_model.predict(xtrain)
Accuracy = accuracy_score(ytrain,ypredtrain)
print('Accuracy:',Accuracy)
Confusion_matrix = confusion_matrix(ytrain,ypredtrain)
print('Confusion_matrix: \n',Confusion_matrix)
Classification_report = classification_report(ytrain,ypredtrain)
print('Classification_report: \n',Classification_report)
Accuracy: 0.838475499092559
Confusion_matrix:
[[924 0]
[178 0]]
Classification_report:
precision recall f1-score support
0 0.84 1.00 0.91 924
1 0.00 0.00 0.00 178
accuracy 0.84 1102
macro avg 0.42 0.50 0.46 1102
weighted avg 0.70 0.84 0.76 1102
# Model Building
import tensorflow as tf
import keras
from keras import Sequential
from keras.layers import Dense,Dropout
nn = Sequential()
# Create input layer and 1st Hidden layer
nn.add(Dense(units = 6,activation='relu',kernel_initializer='he_uniform',input_dim=(29)))
# Add 1st Dropout layer
nn.add(Dropout(rate=0.5))
# Add 2nd Hidden layer
nn.add(Dense(units = 6,activation='relu',kernel_initializer='he_uniform'))
# Add 2nd Dropout layer
nn.add(Dropout(rate=0.5))
# Add output layer
nn.add(Dense(units=1 ,activation='sigmoid', kernel_initializer='glorot_uniform'))
# Compile
nn.compile(optimizer='Adam',loss='binary_crossentropy' ,metrics=['accuracy','Recall','Precision'])
# Training
model_train= nn.fit(xtrain,ytrain,validation_split=0.2,epochs=100,batch_size=20)
Epoch 1/100 45/45 [==============================] - 2s 10ms/step - loss: 0.7820 - accuracy: 0.7242 - recall: 0.1942 - precision: 0.1709 - val_loss: 0.6188 - val_accuracy: 0.7602 - val_recall: 0.2821 - val_precision: 0.3056 Epoch 2/100 45/45 [==============================] - 0s 3ms/step - loss: 0.6897 - accuracy: 0.7582 - recall: 0.1655 - precision: 0.1917 - val_loss: 0.5879 - val_accuracy: 0.8416 - val_recall: 0.2051 - val_precision: 0.6667 Epoch 3/100 45/45 [==============================] - 0s 3ms/step - loss: 0.6642 - accuracy: 0.7696 - recall: 0.1295 - precision: 0.1800 - val_loss: 0.5646 - val_accuracy: 0.8235 - val_recall: 0.0256 - val_precision: 0.5000 Epoch 4/100 45/45 [==============================] - 0s 3ms/step - loss: 0.6437 - accuracy: 0.7877 - recall: 0.1511 - precision: 0.2333 - val_loss: 0.5488 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 5/100 45/45 [==============================] - 0s 3ms/step - loss: 0.5764 - accuracy: 0.8036 - recall: 0.1007 - precision: 0.2258 - val_loss: 0.5312 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 6/100 45/45 [==============================] - 0s 3ms/step - loss: 0.5815 - accuracy: 0.8184 - recall: 0.1007 - precision: 0.2857 - val_loss: 0.5173 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 7/100 45/45 [==============================] - 0s 3ms/step - loss: 0.5608 - accuracy: 0.8150 - recall: 0.0719 - precision: 0.2273 - val_loss: 0.5092 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 8/100 45/45 [==============================] - 0s 5ms/step - loss: 0.5603 - accuracy: 0.8070 - recall: 0.0432 - precision: 0.1395 - val_loss: 0.5015 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 9/100 45/45 [==============================] - 0s 4ms/step - loss: 0.5463 - accuracy: 0.8297 - recall: 0.0288 - precision: 0.2105 - val_loss: 0.4936 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 10/100 45/45 [==============================] - 0s 4ms/step - loss: 0.5165 - accuracy: 0.8343 - recall: 0.0576 - precision: 0.3478 - val_loss: 0.4856 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 11/100 45/45 [==============================] - 0s 4ms/step - loss: 0.5143 - accuracy: 0.8365 - recall: 0.0360 - precision: 0.3333 - val_loss: 0.4780 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 12/100 45/45 [==============================] - 0s 4ms/step - loss: 0.5000 - accuracy: 0.8388 - recall: 0.0288 - precision: 0.3636 - val_loss: 0.4719 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 13/100 45/45 [==============================] - 0s 5ms/step - loss: 0.4635 - accuracy: 0.8388 - recall: 0.0144 - precision: 0.2857 - val_loss: 0.4664 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 14/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4849 - accuracy: 0.8331 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4618 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 15/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4845 - accuracy: 0.8343 - recall: 0.0072 - precision: 0.1111 - val_loss: 0.4566 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 16/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4765 - accuracy: 0.8388 - recall: 0.0144 - precision: 0.2857 - val_loss: 0.4528 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 17/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4894 - accuracy: 0.8365 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4501 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 18/100 45/45 [==============================] - 0s 5ms/step - loss: 0.4667 - accuracy: 0.8365 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4474 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 19/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4559 - accuracy: 0.8400 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4430 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 20/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4641 - accuracy: 0.8411 - recall: 0.0072 - precision: 0.3333 - val_loss: 0.4423 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 21/100 45/45 [==============================] - 0s 5ms/step - loss: 0.4657 - accuracy: 0.8422 - recall: 0.0072 - precision: 0.5000 - val_loss: 0.4409 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 22/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4454 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4398 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 23/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4594 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4381 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 24/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4510 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4368 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 25/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4536 - accuracy: 0.8434 - recall: 0.0072 - precision: 1.0000 - val_loss: 0.4362 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 26/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4648 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4358 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 27/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4404 - accuracy: 0.8388 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4349 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 28/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4421 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4347 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 29/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4382 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4339 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 30/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4371 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4335 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 31/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4403 - accuracy: 0.8422 - recall: 0.0072 - precision: 0.5000 - val_loss: 0.4344 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 32/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4349 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4335 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 33/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4197 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4315 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 34/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4419 - accuracy: 0.8400 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4314 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 35/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4475 - accuracy: 0.8422 - recall: 0.0072 - precision: 0.5000 - val_loss: 0.4328 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 36/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4359 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4329 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 37/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4391 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4339 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 38/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4329 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4330 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 39/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4217 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4327 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 40/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4311 - accuracy: 0.8434 - recall: 0.0072 - precision: 1.0000 - val_loss: 0.4334 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 41/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4287 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4323 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 42/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4163 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4312 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 43/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4315 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4312 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 44/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4261 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4308 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 45/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4284 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4306 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 46/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4213 - accuracy: 0.8400 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4310 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 47/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4313 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4321 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 48/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4386 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4323 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 49/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4193 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4332 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 50/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4189 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4317 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 51/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4091 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4324 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 52/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4156 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4322 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 53/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4353 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4330 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 54/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4202 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4346 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 55/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4289 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4356 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 56/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4106 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4363 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 57/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4206 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4371 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 58/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4136 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4371 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 59/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4102 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4371 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 60/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4167 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4374 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 61/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4187 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4380 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 62/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4080 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4387 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 63/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4085 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4401 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 64/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4196 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4384 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 65/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4223 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4379 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 66/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4198 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4371 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 67/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4205 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4380 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 68/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4081 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4371 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 69/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4197 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4375 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 70/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4095 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4383 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 71/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4032 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4383 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 72/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4121 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4374 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 73/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4096 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4377 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 74/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4055 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4402 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 75/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4002 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4416 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 76/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4028 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4400 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 77/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4061 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4389 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 78/100 45/45 [==============================] - 0s 3ms/step - loss: 0.3982 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4400 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 79/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4066 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4419 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 80/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4022 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4416 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 81/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4022 - accuracy: 0.8434 - recall: 0.0072 - precision: 1.0000 - val_loss: 0.4401 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 82/100 45/45 [==============================] - 0s 3ms/step - loss: 0.3980 - accuracy: 0.8445 - recall: 0.0144 - precision: 1.0000 - val_loss: 0.4398 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 83/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4137 - accuracy: 0.8434 - recall: 0.0072 - precision: 1.0000 - val_loss: 0.4415 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 84/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4042 - accuracy: 0.8411 - recall: 0.0072 - precision: 0.3333 - val_loss: 0.4415 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 85/100 45/45 [==============================] - 0s 4ms/step - loss: 0.3928 - accuracy: 0.8434 - recall: 0.0072 - precision: 1.0000 - val_loss: 0.4411 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 86/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4048 - accuracy: 0.8422 - recall: 0.0072 - precision: 0.5000 - val_loss: 0.4424 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 87/100 45/45 [==============================] - 0s 3ms/step - loss: 0.4015 - accuracy: 0.8400 - recall: 0.0072 - precision: 0.2500 - val_loss: 0.4437 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 88/100 45/45 [==============================] - 0s 4ms/step - loss: 0.3983 - accuracy: 0.8445 - recall: 0.0144 - precision: 1.0000 - val_loss: 0.4448 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 89/100 45/45 [==============================] - 0s 5ms/step - loss: 0.3965 - accuracy: 0.8502 - recall: 0.0504 - precision: 1.0000 - val_loss: 0.4444 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 90/100 45/45 [==============================] - 0s 5ms/step - loss: 0.3985 - accuracy: 0.8479 - recall: 0.0576 - precision: 0.7273 - val_loss: 0.4441 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 91/100 45/45 [==============================] - 0s 4ms/step - loss: 0.3975 - accuracy: 0.8445 - recall: 0.0432 - precision: 0.6000 - val_loss: 0.4457 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 92/100 45/45 [==============================] - 0s 4ms/step - loss: 0.3998 - accuracy: 0.8445 - recall: 0.0360 - precision: 0.6250 - val_loss: 0.4469 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 93/100 45/45 [==============================] - 0s 4ms/step - loss: 0.3867 - accuracy: 0.8468 - recall: 0.0288 - precision: 1.0000 - val_loss: 0.4471 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 94/100 45/45 [==============================] - 0s 5ms/step - loss: 0.3866 - accuracy: 0.8468 - recall: 0.0432 - precision: 0.7500 - val_loss: 0.4496 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 95/100 45/45 [==============================] - 0s 4ms/step - loss: 0.3983 - accuracy: 0.8468 - recall: 0.0432 - precision: 0.7500 - val_loss: 0.4508 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 96/100 45/45 [==============================] - 0s 5ms/step - loss: 0.3904 - accuracy: 0.8422 - recall: 0.0288 - precision: 0.5000 - val_loss: 0.4485 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 97/100 45/45 [==============================] - 0s 4ms/step - loss: 0.4032 - accuracy: 0.8411 - recall: 0.0360 - precision: 0.4545 - val_loss: 0.4491 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 98/100 45/45 [==============================] - 0s 5ms/step - loss: 0.3996 - accuracy: 0.8422 - recall: 0.0288 - precision: 0.5000 - val_loss: 0.4503 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 99/100 45/45 [==============================] - 0s 5ms/step - loss: 0.3903 - accuracy: 0.8456 - recall: 0.0288 - precision: 0.8000 - val_loss: 0.4498 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00 Epoch 100/100 45/45 [==============================] - 0s 5ms/step - loss: 0.3906 - accuracy: 0.8434 - recall: 0.0360 - precision: 0.5556 - val_loss: 0.4499 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
# Model Testing
ypredtest = nn.predict(xtest)
12/12 [==============================] - 0s 1ms/step
ypredtest
array([[0.01510506],
[0.20445046],
[0.02313249],
[0.2447614 ],
[0.11240595],
[0.25176114],
[0.17864135],
[0.01808105],
[0.15272936],
[0.27237415],
[0.26029631],
[0.0557157 ],
[0.16247971],
[0.10556228],
[0.12715463],
[0.05700505],
[0.20292723],
[0.09968841],
[0.15977632],
[0.17864135],
[0.2798506 ],
[0.17864135],
[0.17864135],
[0.30207202],
[0.06405034],
[0.17864135],
[0.06013674],
[0.16053565],
[0.17589574],
[0.21343994],
[0.0364303 ],
[0.04026547],
[0.12827843],
[0.17406179],
[0.05245738],
[0.00125493],
[0.15511252],
[0.15144907],
[0.23954777],
[0.14012903],
[0.15522859],
[0.2233578 ],
[0.16509353],
[0.26715297],
[0.07053942],
[0.24002005],
[0.14947313],
[0.22769532],
[0.1019326 ],
[0.2713851 ],
[0.1695767 ],
[0.1945976 ],
[0.13116962],
[0.17864135],
[0.00316014],
[0.0790848 ],
[0.17864135],
[0.00331617],
[0.20549418],
[0.08482029],
[0.05072754],
[0.06374671],
[0.0651449 ],
[0.2939182 ],
[0.08386078],
[0.12554254],
[0.19114166],
[0.14072482],
[0.26029631],
[0.01196847],
[0.0507173 ],
[0.04790408],
[0.31563863],
[0.29325575],
[0.06633819],
[0.26757872],
[0.17632928],
[0.07789939],
[0.18047896],
[0.0004058 ],
[0.17864135],
[0.27902952],
[0.00927696],
[0.17864135],
[0.01344493],
[0.3010726 ],
[0.22933711],
[0.17232682],
[0.26729128],
[0.11951035],
[0.23603752],
[0.26029631],
[0.17864135],
[0.15779273],
[0.01229363],
[0.03191255],
[0.0187415 ],
[0.15875162],
[0.21060082],
[0.05792176],
[0.0768427 ],
[0.27470306],
[0.00513243],
[0.07764331],
[0.2619322 ],
[0.08740467],
[0.10199311],
[0.21328129],
[0.12665364],
[0.04587372],
[0.06382611],
[0.02262612],
[0.08895383],
[0.23007524],
[0.11267168],
[0.14078857],
[0.25319505],
[0.26330686],
[0.08358296],
[0.08433616],
[0.10481097],
[0.17864135],
[0.17864135],
[0.1491308 ],
[0.14289218],
[0.09642561],
[0.08585224],
[0.17210375],
[0.17864135],
[0.05308583],
[0.17864135],
[0.25155568],
[0.01316426],
[0.03361205],
[0.1646049 ],
[0.17024504],
[0.24575444],
[0.17864135],
[0.17940202],
[0.14060722],
[0.17864135],
[0.08208651],
[0.17864135],
[0.16486545],
[0.19501169],
[0.17864135],
[0.17864135],
[0.02165459],
[0.17864135],
[0.15567924],
[0.1198564 ],
[0.125373 ],
[0.26029631],
[0.15322718],
[0.04668615],
[0.1608112 ],
[0.02872171],
[0.06027906],
[0.18825325],
[0.2448634 ],
[0.17864135],
[0.07112506],
[0.15243876],
[0.09913176],
[0.17864135],
[0.09134295],
[0.27147523],
[0.11566301],
[0.17864135],
[0.17864135],
[0.2664947 ],
[0.08195227],
[0.11174984],
[0.01494817],
[0.20882936],
[0.15095532],
[0.00298889],
[0.18600288],
[0.08491654],
[0.20261735],
[0.14526552],
[0.17864135],
[0.21982045],
[0.2440781 ],
[0.26029631],
[0.18553853],
[0.21339819],
[0.28174663],
[0.23102461],
[0.26029631],
[0.22908273],
[0.26029631],
[0.19858061],
[0.1377052 ],
[0.03284998],
[0.26029631],
[0.1392803 ],
[0.17864135],
[0.20481087],
[0.17864135],
[0.27247092],
[0.24391419],
[0.01871445],
[0.13344584],
[0.09237079],
[0.00121117],
[0.08360118],
[0.00102865],
[0.16539492],
[0.27777648],
[0.25430283],
[0.0637172 ],
[0.17864135],
[0.03634345],
[0.26332825],
[0.27286452],
[0.30843747],
[0.08077957],
[0.26658297],
[0.17102545],
[0.15640703],
[0.17864135],
[0.13594404],
[0.05219484],
[0.00595472],
[0.0783316 ],
[0.17864135],
[0.2593516 ],
[0.16748501],
[0.06541692],
[0.23679696],
[0.00642795],
[0.2756765 ],
[0.0244295 ],
[0.03046309],
[0.17864135],
[0.06599958],
[0.1735468 ],
[0.26029631],
[0.01298711],
[0.17864135],
[0.14433551],
[0.22224829],
[0.26916125],
[0.01386248],
[0.0545248 ],
[0.10220438],
[0.28250805],
[0.1060359 ],
[0.17695737],
[0.10343977],
[0.17864135],
[0.05588656],
[0.2539565 ],
[0.03529732],
[0.02105915],
[0.1310736 ],
[0.16897002],
[0.17864135],
[0.16364472],
[0.08125693],
[0.16763338],
[0.17864135],
[0.15711759],
[0.19832133],
[0.08390985],
[0.23529173],
[0.03127239],
[0.17864135],
[0.26029631],
[0.22231479],
[0.17864135],
[0.21266054],
[0.23025179],
[0.18405075],
[0.16127308],
[0.16364063],
[0.13972007],
[0.01925687],
[0.27160215],
[0.30919692],
[0.00588664],
[0.01169947],
[0.09484629],
[0.26029631],
[0.17864135],
[0.14511535],
[0.18220787],
[0.0388039 ],
[0.17864135],
[0.09844014],
[0.04057379],
[0.1337107 ],
[0.0125258 ],
[0.16890198],
[0.29679382],
[0.09167103],
[0.02465587],
[0.17864135],
[0.05458304],
[0.08702319],
[0.03677688],
[0.17675959],
[0.00092281],
[0.0072814 ],
[0.14503984],
[0.10235583],
[0.17864135],
[0.01230869],
[0.23082604],
[0.00960664],
[0.11529443],
[0.03483866],
[0.15413877],
[0.07259345],
[0.17864135],
[0.17864135],
[0.14462319],
[0.17057335],
[0.17864135],
[0.15546933],
[0.14479683],
[0.17864135],
[0.15372086],
[0.06519894],
[0.17864135],
[0.0192605 ],
[0.04998954],
[0.09350831],
[0.120751 ],
[0.17354558],
[0.07330868],
[0.07280812],
[0.17864135],
[0.09785005],
[0.17864135],
[0.10088717],
[0.05949574],
[0.15908647],
[0.1254853 ],
[0.14708665],
[0.0614772 ],
[0.13817297],
[0.02034603],
[0.1802322 ],
[0.2787524 ],
[0.29827794],
[0.16202964],
[0.03965331],
[0.24010457],
[0.07203472],
[0.01714903],
[0.11134897],
[0.27343938],
[0.00378489],
[0.03697024],
[0.02776049],
[0.06400913],
[0.19636442],
[0.14231376],
[0.12078112],
[0.1963701 ],
[0.02792522],
[0.17864135],
[0.0022663 ],
[0.23789994],
[0.17864135],
[0.24822074]], dtype=float32)
ypredtest = list(map(int,(ypredtest>0.4)))
print(ypredtest)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Accuracy = accuracy_score(ytest,ypredtest)
Accuracy
0.8396739130434783
final_y = pd.DataFrame(ypredtest)
final_pred = final_y.replace({1:'Y',0:'N'})
final_pred
| 0 | |
|---|---|
| 0 | N |
| 1 | N |
| 2 | N |
| 3 | N |
| 4 | N |
| ... | ... |
| 363 | N |
| 364 | N |
| 365 | N |
| 366 | N |
| 367 | N |
368 rows × 1 columns
# choose prediction error and prediction of best model is logistic Model
# accuracy = 0.8586
ypredtest = log_model.predict(xtest)
ypredtest
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Final_pred = pd.DataFrame(ypredtest)
Final_pred
| 0 | |
|---|---|
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 4 | 0 |
| ... | ... |
| 363 | 0 |
| 364 | 0 |
| 365 | 0 |
| 366 | 0 |
| 367 | 0 |
368 rows × 1 columns
# 9. Clustering - Find intersting clusters using K-Means, Heirarchical and DBSCAN clustering.
# Connect to Domain scenario and its usefulness in analysis (Ignore Attrition column)
final_log.head()
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | Department_Research & Development | Department_Sales | EducationField_Marketing | EducationField_Technical Degree | JobRole_Healthcare Representative | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | ... | JobLevel | JobSatisfaction | MonthlyIncome | StandardHours | StockOptionLevel | TotalWorkingYears | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | 1.153254 | 0.129018 | 0.0 | -1.018674 | -0.374906 | -2.493820 | 0.041137 | -0.018341 | 0.294570 |
| 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | -0.660853 | -0.140791 | 0.0 | 0.510149 | -0.057867 | 0.338096 | 1.069787 | 0.882230 | 0.888852 |
| 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | 0.246200 | -1.091220 | 0.0 | -1.018674 | -0.533426 | 0.338096 | -1.501837 | -1.219103 | -1.191138 |
| 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | 0.246200 | -0.835167 | 0.0 | -1.018674 | -0.374906 | 0.338096 | 0.555462 | 0.882230 | -1.191138 |
| 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | -0.660853 | -0.660400 | 0.0 | 0.510149 | -0.691946 | 0.338096 | -0.987512 | -0.618722 | -0.596855 |
5 rows × 29 columns
# Model Building
kmeans_model = KMeans(n_clusters=5) # n_clusters=8 by default
kmeans_model.fit(x_log)
KMeans(n_clusters=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=5)
# With cluster some of squares
kmeans_model.inertia_
28488.260625226452
# Labeling Variable
C_var = kmeans_model.fit_predict(x_log)
C_var
array([3, 4, 3, ..., 4, 2, 0], dtype=int32)
# Cluster = 0
index_0 = np.where(C_var==0)
index_0
(array([ 3, 4, 10, 12, 13, 16, 17, 19, 20, 23, 30,
31, 32, 35, 37, 38, 39, 40, 41, 49, 53, 54,
57, 66, 68, 72, 74, 81, 84, 86, 97, 99, 101,
104, 108, 109, 113, 114, 115, 125, 128, 130, 138, 141,
143, 145, 146, 149, 156, 159, 161, 162, 164, 170, 174,
175, 176, 179, 181, 183, 184, 191, 193, 196, 197, 198,
200, 202, 203, 206, 207, 208, 224, 225, 230, 232, 238,
240, 241, 242, 246, 248, 249, 252, 253, 254, 255, 258,
262, 265, 267, 273, 274, 284, 287, 292, 294, 298, 299,
301, 302, 309, 310, 312, 318, 320, 328, 330, 331, 333,
337, 340, 345, 346, 347, 349, 350, 351, 354, 358, 365,
369, 371, 372, 373, 374, 377, 380, 381, 383, 387, 388,
389, 393, 396, 397, 399, 402, 404, 409, 413, 419, 428,
430, 431, 437, 438, 441, 449, 450, 454, 460, 461, 470,
471, 474, 478, 481, 483, 485, 486, 487, 488, 490, 493,
494, 496, 499, 500, 505, 507, 511, 512, 515, 516, 520,
522, 539, 542, 543, 546, 548, 549, 550, 555, 556, 557,
559, 560, 565, 570, 571, 572, 574, 575, 576, 577, 579,
580, 583, 586, 596, 597, 599, 601, 603, 605, 606, 613,
615, 617, 618, 620, 622, 623, 626, 628, 629, 632, 634,
637, 638, 639, 640, 642, 643, 644, 648, 650, 654, 655,
657, 659, 665, 668, 670, 671, 672, 673, 678, 679, 680,
684, 691, 694, 697, 698, 703, 712, 713, 715, 717, 719,
722, 724, 726, 727, 734, 735, 737, 739, 742, 747, 754,
759, 763, 765, 767, 769, 772, 781, 782, 786, 790, 793,
794, 795, 802, 803, 811, 815, 816, 818, 819, 820, 822,
823, 824, 826, 827, 830, 832, 833, 835, 839, 840, 845,
848, 850, 854, 856, 859, 862, 863, 865, 866, 868, 876,
877, 878, 884, 885, 893, 895, 901, 902, 903, 906, 909,
917, 921, 924, 925, 929, 931, 933, 934, 938, 957, 961,
965, 970, 972, 973, 982, 984, 986, 988, 989, 990, 991,
993, 996, 998, 1000, 1001, 1002, 1003, 1004, 1011, 1013, 1015,
1017, 1019, 1020, 1022, 1025, 1026, 1027, 1028, 1035, 1037, 1038,
1041, 1042, 1045, 1046, 1047, 1049, 1051, 1052, 1059, 1061, 1065,
1066, 1067, 1069, 1070, 1072, 1074, 1079, 1082, 1088, 1091, 1097,
1098, 1101, 1104, 1105, 1107, 1108, 1109, 1113, 1115, 1117, 1120,
1121, 1123, 1125, 1127, 1128, 1132, 1133, 1134, 1137, 1139, 1141,
1144, 1145, 1152, 1158, 1161, 1168, 1169, 1170, 1172, 1173, 1178,
1180, 1182, 1189, 1191, 1192, 1197, 1199, 1200, 1202, 1207, 1211,
1215, 1217, 1219, 1224, 1226, 1227, 1228, 1229, 1230, 1233, 1234,
1238, 1241, 1245, 1247, 1248, 1250, 1252, 1254, 1256, 1258, 1259,
1261, 1270, 1272, 1276, 1283, 1285, 1286, 1287, 1292, 1293, 1294,
1299, 1302, 1306, 1309, 1311, 1317, 1319, 1321, 1323, 1324, 1325,
1329, 1335, 1337, 1342, 1343, 1345, 1349, 1352, 1355, 1358, 1359,
1360, 1362, 1366, 1367, 1371, 1376, 1378, 1380, 1381, 1382, 1387,
1388, 1391, 1397, 1400, 1402, 1406, 1407, 1408, 1411, 1413, 1415,
1417, 1419, 1422, 1423, 1426, 1427, 1428, 1435, 1436, 1440, 1448,
1449, 1453, 1454, 1455, 1456, 1457, 1458, 1459, 1460, 1464, 1465,
1469]),)
cluster_0 = final_log.iloc[index_0]
cluster_0
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | Department_Research & Development | Department_Sales | EducationField_Marketing | EducationField_Technical Degree | JobRole_Healthcare Representative | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | ... | JobLevel | JobSatisfaction | MonthlyIncome | StandardHours | StockOptionLevel | TotalWorkingYears | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | 0.246200 | -0.835167 | 0.0 | -1.018674 | -0.374906 | 0.338096 | 0.555462 | 0.882230 | -1.191138 |
| 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | -0.660853 | -0.660400 | 0.0 | 0.510149 | -0.691946 | 0.338096 | -0.987512 | -0.618722 | -0.596855 |
| 10 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | -0.660853 | -0.986172 | 0.0 | 0.510149 | -0.691946 | 0.338096 | -0.216025 | -0.018341 | -0.299713 |
| 12 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | 0.246200 | -0.834541 | 0.0 | 0.510149 | -0.850466 | -1.077862 | -0.216025 | -0.618722 | -0.299713 |
| 13 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | 1.153254 | -0.912702 | 0.0 | 0.510149 | -1.167505 | 0.338096 | -0.987512 | -0.618722 | -0.596855 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1459 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.057788 | -0.660853 | -0.486259 | 0.0 | 0.510149 | -0.057867 | 0.338096 | -0.473188 | -0.318532 | -0.299713 |
| 1460 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | -1.567907 | -0.561293 | 0.0 | -1.018674 | -0.850466 | -2.493820 | -0.216025 | -0.018341 | -0.002572 |
| 1464 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | 0.246200 | -0.817346 | 0.0 | -1.018674 | -0.850466 | 0.338096 | -0.473188 | -0.618722 | -1.191138 |
| 1465 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.057788 | 1.153254 | -0.940839 | 0.0 | 0.510149 | 1.051772 | 0.338096 | -0.216025 | -0.618722 | -0.299713 |
| 1469 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.057788 | 0.246200 | -0.367768 | 0.0 | -1.018674 | -0.691946 | 1.754054 | -0.473188 | -0.318532 | -0.596855 |
529 rows × 29 columns
# cluster=1
index_1 = np.where(C_var==1)
index_1
(array([ 18, 25, 28, 29, 45, 62, 63, 65, 67, 70, 77,
78, 82, 85, 90, 93, 95, 98, 106, 112, 123, 126,
131, 133, 136, 147, 163, 165, 186, 187, 189, 190, 209,
213, 215, 218, 219, 231, 233, 235, 237, 244, 245, 257,
263, 268, 270, 271, 275, 276, 279, 280, 290, 295, 300,
307, 308, 313, 314, 316, 326, 329, 332, 348, 360, 367,
376, 379, 390, 392, 400, 401, 406, 408, 411, 417, 420,
424, 425, 427, 429, 433, 435, 448, 455, 458, 464, 465,
466, 473, 477, 489, 492, 497, 510, 534, 535, 536, 538,
541, 544, 552, 561, 568, 584, 588, 592, 595, 616, 624,
625, 646, 649, 653, 674, 677, 695, 699, 701, 706, 716,
721, 723, 728, 736, 738, 741, 743, 750, 753, 755, 758,
766, 770, 771, 774, 775, 787, 789, 799, 806, 810, 812,
813, 814, 821, 838, 851, 858, 869, 887, 890, 894, 897,
898, 899, 904, 905, 907, 913, 914, 916, 919, 922, 926,
937, 945, 954, 955, 956, 962, 966, 971, 975, 976, 987,
994, 999, 1008, 1010, 1014, 1024, 1031, 1034, 1043, 1044, 1054,
1062, 1076, 1078, 1080, 1086, 1093, 1096, 1111, 1126, 1135, 1138,
1140, 1154, 1164, 1176, 1177, 1194, 1195, 1196, 1203, 1223, 1225,
1235, 1242, 1264, 1268, 1275, 1277, 1301, 1303, 1305, 1310, 1327,
1331, 1348, 1351, 1370, 1374, 1377, 1396, 1401, 1403, 1437, 1443,
1461, 1462]),)
cluster_1 = final_log.iloc[index_1]
cluster_1
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | Department_Research & Development | Department_Sales | EducationField_Marketing | EducationField_Technical Degree | JobRole_Healthcare Representative | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | ... | JobLevel | JobSatisfaction | MonthlyIncome | StandardHours | StockOptionLevel | TotalWorkingYears | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 18 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1.749610 | 1.153254 | 3.078475 | 0.0 | -1.018674 | 0.144974 | 0.338096 | 0.300399 | 1.182421 | 0.888852 |
| 25 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 2.653309 | 0.246200 | 0.288444 | 0.0 | 0.510149 | 2.478450 | -1.077862 | 2.098437 | 2.683373 | 1.185994 |
| 28 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0.845911 | 1.153254 | 1.459306 | 0.0 | 0.510149 | 2.161410 | 0.338096 | 0.300399 | 0.582040 | 0.034015 |
| 29 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 2.653309 | -1.567907 | 0.288444 | 0.0 | -1.018674 | 1.844371 | -1.077862 | -0.987512 | -0.618722 | -0.893996 |
| 45 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 2.653309 | 0.246200 | 0.288444 | 0.0 | -1.018674 | 2.002890 | 0.338096 | 0.300399 | 0.050478 | 1.185994 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1403 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 1.749610 | -1.567907 | 2.426305 | 0.0 | -1.018674 | 1.685851 | 0.338096 | 0.300399 | 1.182421 | 1.780277 |
| 1437 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 2.653309 | 1.153254 | 0.288444 | 0.0 | -1.018674 | 1.685851 | -1.077862 | 0.041137 | -1.219103 | -0.299713 |
| 1443 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 2.653309 | 0.246200 | 0.288444 | 0.0 | -1.018674 | 2.161410 | -1.077862 | 0.300399 | 0.582040 | 2.968843 |
| 1461 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0.845911 | -1.567907 | 1.648767 | 0.0 | 0.510149 | 1.527331 | 0.338096 | -0.730350 | -0.618722 | -1.191138 |
| 1462 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 1.749610 | 1.153254 | 2.016746 | 0.0 | 0.510149 | 1.685851 | -1.077862 | 0.300399 | 1.482611 | 0.591711 |
233 rows × 29 columns
# cluster=2
index_2 = np.where(C_var==2)
index_2
(array([ 5, 9, 11, 15, 22, 27, 43, 55, 59, 61, 64,
73, 76, 80, 83, 88, 89, 92, 94, 103, 116, 117,
119, 121, 124, 129, 137, 139, 144, 150, 151, 153, 154,
155, 158, 166, 167, 172, 173, 185, 188, 195, 199, 201,
210, 212, 220, 222, 223, 226, 227, 228, 243, 247, 256,
261, 269, 282, 283, 285, 291, 293, 297, 303, 304, 305,
306, 311, 315, 317, 319, 321, 322, 324, 334, 338, 339,
341, 342, 343, 344, 353, 355, 359, 361, 366, 384, 386,
394, 398, 403, 412, 423, 426, 432, 442, 444, 446, 447,
452, 459, 462, 467, 468, 472, 491, 503, 506, 508, 514,
519, 523, 524, 526, 527, 529, 530, 531, 532, 533, 551,
554, 558, 562, 564, 569, 578, 590, 593, 600, 604, 607,
608, 610, 621, 635, 636, 641, 647, 652, 658, 664, 675,
681, 685, 686, 690, 692, 693, 696, 702, 704, 705, 707,
708, 710, 718, 729, 730, 733, 745, 749, 751, 752, 756,
757, 760, 768, 773, 779, 780, 783, 784, 788, 796, 805,
807, 809, 825, 836, 837, 844, 846, 852, 855, 870, 872,
873, 874, 879, 881, 883, 888, 889, 891, 896, 900, 908,
920, 923, 927, 928, 930, 932, 935, 941, 942, 943, 944,
947, 949, 950, 951, 958, 959, 960, 963, 964, 968, 969,
979, 983, 985, 995, 997, 1005, 1007, 1018, 1029, 1030, 1033,
1040, 1048, 1050, 1055, 1063, 1073, 1081, 1084, 1085, 1089, 1090,
1094, 1095, 1099, 1103, 1106, 1114, 1119, 1122, 1124, 1130, 1131,
1142, 1143, 1146, 1147, 1148, 1149, 1150, 1155, 1156, 1157, 1159,
1160, 1162, 1163, 1165, 1174, 1179, 1181, 1187, 1188, 1190, 1204,
1208, 1210, 1212, 1214, 1216, 1218, 1220, 1231, 1232, 1240, 1244,
1251, 1253, 1260, 1265, 1267, 1269, 1274, 1278, 1280, 1281, 1282,
1288, 1289, 1295, 1296, 1304, 1318, 1322, 1330, 1334, 1340, 1341,
1346, 1350, 1357, 1361, 1363, 1364, 1368, 1373, 1385, 1386, 1392,
1393, 1395, 1399, 1404, 1409, 1410, 1412, 1416, 1418, 1421, 1424,
1425, 1429, 1430, 1431, 1434, 1439, 1444, 1450, 1451, 1463, 1466,
1468]),)
cluster_2 = final_log.iloc[index_2]
cluster_2
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | Department_Research & Development | Department_Sales | EducationField_Marketing | EducationField_Technical Degree | JobRole_Healthcare Representative | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | ... | JobLevel | JobSatisfaction | MonthlyIncome | StandardHours | StockOptionLevel | TotalWorkingYears | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | 1.153254 | -0.785457 | 0.0 | -1.018674 | -0.374906 | -1.077862 | 0.298300 | 0.882230 | 0.591711 |
| 9 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | -0.057788 | 0.246200 | -0.107338 | 0.0 | 2.038972 | 1.051772 | -1.077862 | 0.298300 | 0.882230 | 0.888852 |
| 11 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.057788 | 0.246200 | -0.433736 | 0.0 | -1.018674 | -0.057867 | 0.338096 | 0.812625 | 0.281849 | 1.185994 |
| 15 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0.845911 | -1.567907 | 1.375519 | 0.0 | 0.510149 | -0.057867 | 0.338096 | 1.069787 | 1.482611 | 1.185994 |
| 22 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.845911 | -0.660853 | 2.005178 | 0.0 | -1.018674 | 0.417693 | 0.338096 | 1.584112 | 0.582040 | 2.077418 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1450 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.845911 | 1.153254 | 1.018170 | 0.0 | -1.018674 | -0.216386 | 0.338096 | 0.812625 | -1.219103 | 0.888852 |
| 1451 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | 1.153254 | -0.074198 | 0.0 | 0.510149 | -0.057867 | 0.338096 | 1.069787 | 0.882230 | 1.483135 |
| 1463 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | -0.057788 | -1.567907 | 1.361762 | 0.0 | -1.018674 | -0.057867 | 0.338096 | 0.812625 | -0.018341 | 0.888852 |
| 1466 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0.845911 | -1.567907 | 1.378958 | 0.0 | 0.510149 | -0.216386 | 0.338096 | 0.298300 | 0.882230 | 0.888852 |
| 1468 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | -0.660853 | -0.059504 | 0.0 | -1.018674 | 1.051772 | -1.077862 | 0.812625 | 0.582040 | 1.185994 |
342 rows × 29 columns
# Clusters = 3
index_3 = np.where(C_var==3)
index_3
(array([ 0, 2, 14, 24, 33, 34, 36, 42, 50, 51, 69,
102, 107, 122, 127, 132, 171, 177, 192, 204, 214, 216,
217, 229, 234, 239, 250, 259, 264, 288, 296, 323, 327,
336, 357, 363, 368, 370, 378, 385, 405, 414, 415, 421,
422, 436, 439, 440, 443, 453, 457, 463, 469, 479, 480,
482, 495, 504, 513, 525, 528, 540, 547, 566, 573, 585,
589, 591, 598, 614, 645, 656, 660, 662, 663, 666, 667,
683, 688, 689, 709, 711, 720, 731, 732, 744, 748, 761,
762, 776, 777, 797, 798, 800, 801, 828, 829, 831, 842,
849, 857, 860, 864, 871, 892, 911, 915, 939, 940, 946,
952, 953, 980, 981, 1012, 1016, 1021, 1032, 1036, 1039, 1056,
1057, 1060, 1068, 1077, 1083, 1110, 1112, 1136, 1153, 1167, 1171,
1201, 1205, 1213, 1222, 1236, 1237, 1246, 1249, 1255, 1257, 1262,
1271, 1273, 1279, 1290, 1291, 1297, 1312, 1313, 1326, 1332, 1338,
1339, 1354, 1365, 1369, 1375, 1379, 1390, 1438, 1442, 1452]),)
# cluster=4
index_4 =np.where(C_var==4)
index_4
(array([ 1, 6, 7, 8, 21, 26, 44, 46, 47, 48, 52,
56, 58, 60, 71, 75, 79, 87, 91, 96, 100, 105,
110, 111, 118, 120, 134, 135, 140, 142, 148, 152, 157,
160, 168, 169, 178, 180, 182, 194, 205, 211, 221, 236,
251, 260, 266, 272, 277, 278, 281, 286, 289, 325, 335,
352, 356, 362, 364, 375, 382, 391, 395, 407, 410, 416,
418, 434, 445, 451, 456, 475, 476, 484, 498, 501, 502,
509, 517, 518, 521, 537, 545, 553, 563, 567, 581, 582,
587, 594, 602, 609, 611, 612, 619, 627, 630, 631, 633,
651, 661, 669, 676, 682, 687, 700, 714, 725, 740, 746,
764, 778, 785, 791, 792, 804, 808, 817, 834, 841, 843,
847, 853, 861, 867, 875, 880, 882, 886, 910, 912, 918,
936, 948, 967, 974, 977, 978, 992, 1006, 1009, 1023, 1053,
1058, 1064, 1071, 1075, 1087, 1092, 1100, 1102, 1116, 1118, 1129,
1151, 1166, 1175, 1183, 1184, 1185, 1186, 1193, 1198, 1206, 1209,
1221, 1239, 1243, 1263, 1266, 1284, 1298, 1300, 1307, 1308, 1314,
1315, 1316, 1320, 1328, 1333, 1336, 1344, 1347, 1353, 1356, 1372,
1383, 1384, 1389, 1394, 1398, 1405, 1414, 1420, 1432, 1433, 1441,
1445, 1446, 1447, 1467]),)
cluster_4 = final_log.iloc[index_4]
cluster_4
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | Department_Research & Development | Department_Sales | EducationField_Marketing | EducationField_Technical Degree | JobRole_Healthcare Representative | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | ... | JobLevel | JobSatisfaction | MonthlyIncome | StandardHours | StockOptionLevel | TotalWorkingYears | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | -0.660853 | -0.140791 | 0.0 | 0.510149 | -0.057867 | 0.338096 | 1.069787 | 0.882230 | 0.888852 |
| 6 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | -1.567907 | -0.909888 | 0.0 | 0.195024 | 0.259173 | -1.077862 | -1.244675 | -1.219103 | -1.191138 |
| 7 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | 0.246200 | -0.902697 | 0.0 | 0.510149 | -1.484545 | 0.338096 | -1.244675 | -1.219103 | -1.191138 |
| 8 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0.845911 | 0.246200 | 1.233580 | 0.0 | -1.018674 | -0.057867 | 0.338096 | 0.812625 | 0.882230 | 1.185994 |
| 21 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | -1.567907 | -0.679471 | 0.0 | -1.018674 | -0.057867 | 0.338096 | -0.216025 | -0.318532 | -0.299713 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1441 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | -0.057788 | 0.246200 | 0.226875 | 0.0 | 0.510149 | 0.417693 | -1.077862 | 1.841275 | 2.383182 | 1.483135 |
| 1445 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 1.749610 | -0.660853 | 2.497900 | 0.0 | 0.510149 | 1.685851 | 0.338096 | 0.300399 | 0.882230 | 1.780277 |
| 1446 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | 0.246200 | 0.353807 | 0.0 | 2.038972 | -0.374906 | 0.338096 | 0.555462 | 0.882230 | 0.888852 |
| 1447 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | 1.153254 | -0.054502 | 0.0 | 0.510149 | 0.734732 | -1.077862 | 2.355600 | 2.383182 | 2.077418 |
| 1467 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | -0.057788 | -0.660853 | 0.175602 | 0.0 | 0.510149 | -0.691946 | 0.338096 | 0.041137 | -0.618722 | -0.299713 |
202 rows × 29 columns
WCSS_list1 = []
for k in range(1,10):
kmeans_model = KMeans(n_clusters=k)
kmeans_model.fit(x_log)
WCSS = kmeans_model.inertia_
WCSS_list1.append(WCSS)
print('K >>',k,'WCSS>>',WCSS)
K >> 1 WCSS>> 36749.99999999997 K >> 2 WCSS>> 32432.28022057883 K >> 3 WCSS>> 30728.82109411982 K >> 4 WCSS>> 29482.758915733095 K >> 5 WCSS>> 28490.18203277078 K >> 6 WCSS>> 27776.60942323769 K >> 7 WCSS>> 27144.416717879314 K >> 8 WCSS>> 26649.96132712636 K >> 9 WCSS>> 26359.154762882907
WCSS
26359.154762882907
# Elbow_method
k = range(1,10)
plt.plot(k,WCSS_list1)
plt.xlabel('Number of clusters(K)')
plt.ylabel('WCSS')
plt.title('Elbow method graph')
Text(0.5, 1.0, 'Elbow method graph')
# Predicted = 2
kmeans_model = KMeans(n_clusters=2)
kmeans_model.fit(x_log)
KMeans(n_clusters=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=2)
kmeans_model = KMeans(n_clusters=2) # n_clusters=8 >> default
pred_k=kmeans_model.fit_predict(x_log)
pred_k
array([1, 0, 1, ..., 1, 0, 1], dtype=int32)
final_log
| BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | Department_Research & Development | Department_Sales | EducationField_Marketing | EducationField_Technical Degree | JobRole_Healthcare Representative | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | ... | JobLevel | JobSatisfaction | MonthlyIncome | StandardHours | StockOptionLevel | TotalWorkingYears | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | 1.153254 | 0.129018 | 0.0 | -1.018674 | -0.374906 | -2.493820 | 0.041137 | -0.018341 | 0.294570 |
| 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | -0.660853 | -0.140791 | 0.0 | 0.510149 | -0.057867 | 0.338096 | 1.069787 | 0.882230 | 0.888852 |
| 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | 0.246200 | -1.091220 | 0.0 | -1.018674 | -0.533426 | 0.338096 | -1.501837 | -1.219103 | -1.191138 |
| 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.961486 | 0.246200 | -0.835167 | 0.0 | -1.018674 | -0.374906 | 0.338096 | 0.555462 | 0.882230 | -1.191138 |
| 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.961486 | -0.660853 | -0.660400 | 0.0 | 0.510149 | -0.691946 | 0.338096 | -0.987512 | -0.618722 | -0.596855 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.057788 | 1.153254 | -0.940839 | 0.0 | 0.510149 | 1.051772 | 0.338096 | -0.216025 | -0.618722 | -0.299713 |
| 1466 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0.845911 | -1.567907 | 1.378958 | 0.0 | 0.510149 | -0.216386 | 0.338096 | 0.298300 | 0.882230 | 0.888852 |
| 1467 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | -0.057788 | -0.660853 | 0.175602 | 0.0 | 0.510149 | -0.691946 | 0.338096 | 0.041137 | -0.618722 | -0.299713 |
| 1468 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | -0.057788 | -0.660853 | -0.059504 | 0.0 | -1.018674 | 1.051772 | -1.077862 | 0.812625 | 0.582040 | 1.185994 |
| 1469 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | -0.057788 | 0.246200 | -0.367768 | 0.0 | -1.018674 | -0.691946 | 1.754054 | -0.473188 | -0.318532 | -0.596855 |
1470 rows × 29 columns
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(x_log)
PCA(n_components=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA(n_components=2)
x_pca = pca.transform(x_log)
x_pca
array([[-0.49360165, 1.41619175],
[ 1.15378029, -1.55229841],
[-3.03698632, 1.36126884],
...,
[-0.98841934, -1.0507973 ],
[ 1.47493103, 0.30399305],
[-1.06652057, 0.13379506]])
# normalization of pca due to noise in clustering
x_pca_scaler = StandardScaler().fit(x_pca)
x_pca1 = x_pca_scaler.transform(x_pca)
x_pca1
array([[-0.24458175, 1.0159896 ],
[ 0.57170313, -1.11363383],
[-1.50483988, 0.97658738],
...,
[-0.48976606, -0.7538521 ],
[ 0.73083471, 0.21808754],
[-0.52846556, 0.09598586]])
H = pd.DataFrame(x_pca1)
H
| 0 | 1 | |
|---|---|---|
| 0 | -0.244582 | 1.015990 |
| 1 | 0.571703 | -1.113634 |
| 2 | -1.504840 | 0.976587 |
| 3 | -0.257197 | -0.558017 |
| 4 | -0.981771 | -0.244826 |
| ... | ... | ... |
| 1465 | -0.239128 | 0.387034 |
| 1466 | 0.702694 | 0.004628 |
| 1467 | -0.489766 | -0.753852 |
| 1468 | 0.730835 | 0.218088 |
| 1469 | -0.528466 | 0.095986 |
1470 rows × 2 columns
L = H.rename({0:'X1',1:'Y1'},axis=1)
L
| X1 | Y1 | |
|---|---|---|
| 0 | -0.244582 | 1.015990 |
| 1 | 0.571703 | -1.113634 |
| 2 | -1.504840 | 0.976587 |
| 3 | -0.257197 | -0.558017 |
| 4 | -0.981771 | -0.244826 |
| ... | ... | ... |
| 1465 | -0.239128 | 0.387034 |
| 1466 | 0.702694 | 0.004628 |
| 1467 | -0.489766 | -0.753852 |
| 1468 | 0.730835 | 0.218088 |
| 1469 | -0.528466 | 0.095986 |
1470 rows × 2 columns
# Visualaisation of Clustering
plt.scatter(L['X1'],L['Y1'],c=pred_k,s=30)
plt.title("visualisation of K-means")
plt.xlabel("X-AXIS")
plt.ylabel("Y-AXIS")
plt.show()
from sklearn.metrics import silhouette_score
silhouette_score(L,pred_k)
0.3733471110993441
# Dendrogram
import scipy.cluster.hierarchy as sch
plt.figure(figsize=(10,9))
dendrogram = sch.dendrogram(sch.linkage(L,method='ward'))
plt.title('dendrogram')
plt.xlabel('X-AXIS')
plt.ylabel('Euclidean distances')
plt.show()
from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=2,affinity = 'euclidean',linkage='ward')
cluster.fit_predict(L)
array([0, 1, 0, ..., 0, 1, 0])
XC = cluster.fit_predict(L)
from sklearn.metrics import silhouette_score
silhouette_score(L,XC)
0.38182922187276147
# Visualisation of clustering
plt.scatter(L['X1'],L['Y1'],c=XC,s=30)
plt.title("Visualisation of Heirarchical clustering")
plt.xlabel("X-AXIS")
plt.ylabel("Y-AXIS")
plt.show()
from sklearn.cluster import DBSCAN
A = DBSCAN (eps=3.6,min_samples=8)
labels = A.fit_predict(x_log)
np.unique(labels)
array([-1, 0, 1, 2])
labels = A.fit_predict(x_log)
np.unique(labels)
array([-1, 0, 1, 2])
# VISUALISATION OF CLUSTER
plt.scatter(L['X1'],L['Y1'],c=labels,s=30)
plt.xlabel("X-AXIS")
plt.ylabel("Y-AXIS")
plt.title("Visualisation of DBSCAN ")
plt.show()
silhouette_score(L,labels)
-0.12086921127977733